diff --git a/[refs] b/[refs]
index b745838783e8..722e4d82aa4d 100644
--- a/[refs]
+++ b/[refs]
@@ -1,2 +1,2 @@
 ---
-refs/heads/master: 7a1fb5d06d3936c0982e2cf8b53b046244a9aad6
+refs/heads/master: 8d7773a32d8aa723030712b0a500a4a402a21c85
diff --git a/trunk/Documentation/ABI/testing/sysfs-fs-ext4 b/trunk/Documentation/ABI/testing/sysfs-fs-ext4
deleted file mode 100644
index 4e79074de282..000000000000
--- a/trunk/Documentation/ABI/testing/sysfs-fs-ext4
+++ /dev/null
@@ -1,81 +0,0 @@
-What:		/sys/fs/ext4/<disk>/mb_stats
-Date:		March 2008
-Contact:	"Theodore Ts'o" <tytso@mit.edu>
-Description:
-		Controls whether the multiblock allocator should
-		collect statistics, which are shown during the unmount.
-		1 means to collect statistics, 0 means not to collect
-		statistics
-
-What:		/sys/fs/ext4/<disk>/mb_group_prealloc
-Date:		March 2008
-Contact:	"Theodore Ts'o" <tytso@mit.edu>
-Description:
-		The multiblock allocator will round up allocation
-		requests to a multiple of this tuning parameter if the
-		stripe size is not set in the ext4 superblock
-
-What:		/sys/fs/ext4/<disk>/mb_max_to_scan
-Date:		March 2008
-Contact:	"Theodore Ts'o" <tytso@mit.edu>
-Description:
-		The maximum number of extents the multiblock allocator
-		will search to find the best extent
-
-What:		/sys/fs/ext4/<disk>/mb_min_to_scan
-Date:		March 2008
-Contact:	"Theodore Ts'o" <tytso@mit.edu>
-Description:
-		The minimum number of extents the multiblock allocator
-		will search to find the best extent
-
-What:		/sys/fs/ext4/<disk>/mb_order2_req
-Date:		March 2008
-Contact:	"Theodore Ts'o" <tytso@mit.edu>
-Description:
-		Tuning parameter which controls the minimum size for
-		requests (as a power of 2) where the buddy cache is
-		used
-
-What:		/sys/fs/ext4/<disk>/mb_stream_req
-Date:		March 2008
-Contact:	"Theodore Ts'o" <tytso@mit.edu>
-Description:
-		Files which have fewer blocks than this tunable
-		parameter will have their blocks allocated out of a
-		block group specific preallocation pool, so that small
-		files are packed closely together.  Each large file
-		will have its blocks allocated out of its own unique
-		preallocation pool.
-
-What:		/sys/fs/ext4/<disk>/inode_readahead
-Date:		March 2008
-Contact:	"Theodore Ts'o" <tytso@mit.edu>
-Description:
-		Tuning parameter which controls the maximum number of
-		inode table blocks that ext4's inode table readahead
-		algorithm will pre-read into the buffer cache
-
-What:		/sys/fs/ext4/<disk>/delayed_allocation_blocks
-Date:		March 2008
-Contact:	"Theodore Ts'o" <tytso@mit.edu>
-Description:
-		This file is read-only and shows the number of blocks
-		that are dirty in the page cache, but which do not
-		have their location in the filesystem allocated yet.
-
-What:		/sys/fs/ext4/<disk>/lifetime_write_kbytes
-Date:		March 2008
-Contact:	"Theodore Ts'o" <tytso@mit.edu>
-Description:
-		This file is read-only and shows the number of kilobytes
-		of data that have been written to this filesystem since it was
-		created.
-
-What:		/sys/fs/ext4/<disk>/session_write_kbytes
-Date:		March 2008
-Contact:	"Theodore Ts'o" <tytso@mit.edu>
-Description:
-		This file is read-only and shows the number of
-		kilobytes of data that have been written to this
-		filesystem since it was mounted.
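The tunables documented in the ABI file deleted above remain reachable under /proc/fs/ext4/ after this revert (see the Documentation/filesystems/proc.txt hunk further down, which re-adds them there without the mb_ prefix). A minimal userspace sketch of reading one of them; the device name "sda1" is an illustrative assumption, not part of the patch:

/* Read one ext4 tunable from the /proc interface this revert restores.
 * "sda1" is a placeholder; substitute the block device backing your
 * ext4 filesystem. */
#include <stdio.h>

int main(void)
{
	char buf[64];
	FILE *f = fopen("/proc/fs/ext4/sda1/stream_req", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("stream_req = %s", buf);
	fclose(f);
	return 0;
}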
diff --git a/trunk/Documentation/filesystems/ext4.txt b/trunk/Documentation/filesystems/ext4.txt index 97882df04865..cec829bc7291 100644 --- a/trunk/Documentation/filesystems/ext4.txt +++ b/trunk/Documentation/filesystems/ext4.txt @@ -85,7 +85,7 @@ Note: More extensive information for getting started with ext4 can be * extent format more robust in face of on-disk corruption due to magics, * internal redundancy in tree * improved file allocation (multi-block alloc) -* lift 32000 subdirectory limit imposed by i_links_count[1] +* fix 32000 subdirectory limit * nsec timestamps for mtime, atime, ctime, create time * inode version field on disk (NFSv4, Lustre) * reduced e2fsck time via uninit_bg feature @@ -100,9 +100,6 @@ Note: More extensive information for getting started with ext4 can be * efficent new ordered mode in JBD2 and ext4(avoid using buffer head to force the ordering) -[1] Filesystems with a block size of 1k may see a limit imposed by the -directory hash tree having a maximum depth of two. - 2.2 Candidate features for future inclusion * Online defrag (patches available but not well tested) @@ -183,8 +180,8 @@ commit=nrsec (*) Ext4 can be told to sync all its data and metadata performance. barrier=<0|1(*)> This enables/disables the use of write barriers in -barrier(*) the jbd code. barrier=0 disables, barrier=1 enables. -nobarrier This also requires an IO stack which can support + the jbd code. barrier=0 disables, barrier=1 enables. + This also requires an IO stack which can support barriers, and if jbd gets an error on a barrier write, it will disable again with a warning. Write barriers enforce proper on-disk ordering @@ -192,9 +189,6 @@ nobarrier This also requires an IO stack which can support safe to use, at some performance penalty. If your disks are battery-backed in one way or another, disabling barriers may safely improve performance. - The mount options "barrier" and "nobarrier" can - also be used to enable or disable barriers, for - consistency with other ext4 mount options. inode_readahead=n This tuning parameter controls the maximum number of inode table blocks that ext4's inode @@ -316,24 +310,6 @@ journal_ioprio=prio The I/O priority (from 0 to 7, where 0 is the a slightly higher priority than the default I/O priority. -auto_da_alloc(*) Many broken applications don't use fsync() when -noauto_da_alloc replacing existing files via patterns such as - fd = open("foo.new")/write(fd,..)/close(fd)/ - rename("foo.new", "foo"), or worse yet, - fd = open("foo", O_TRUNC)/write(fd,..)/close(fd). - If auto_da_alloc is enabled, ext4 will detect - the replace-via-rename and replace-via-truncate - patterns and force that any delayed allocation - blocks are allocated such that at the next - journal commit, in the default data=ordered - mode, the data blocks of the new file are forced - to disk before the rename() operation is - commited. This provides roughly the same level - of guarantees as ext3, and avoids the - "zero-length" problem that can happen when a - system crashes before the delayed allocation - blocks are forced to disk. 
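The auto_da_alloc text removed just above documents the replace-via-rename pattern. A minimal sketch of the crash-safe way to perform that replacement once the heuristic is gone — an explicit fsync() before the rename(); the file names "foo"/"foo.new" come from the removed text, and the helper name is hypothetical:

/* Crash-safe replace-via-rename: without auto_da_alloc, applications
 * must fsync() the new file themselves before rename(), or risk a
 * zero-length "foo" if the system crashes before the delayed
 * allocation blocks reach disk. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int replace_file(const char *data, size_t len)
{
	int fd = open("foo.new", O_WRONLY | O_CREAT | O_TRUNC, 0644);

	if (fd < 0)
		return -1;
	if (write(fd, data, len) != (ssize_t)len ||
	    fsync(fd) != 0 ||		/* force delayed-alloc blocks to disk */
	    close(fd) != 0)
		return -1;
	return rename("foo.new", "foo");	/* atomically replace "foo" */
}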
- Data Mode ========= There are 3 different data modes: diff --git a/trunk/Documentation/filesystems/proc.txt b/trunk/Documentation/filesystems/proc.txt index efc4fd9f40ce..830bad7cce0f 100644 --- a/trunk/Documentation/filesystems/proc.txt +++ b/trunk/Documentation/filesystems/proc.txt @@ -940,6 +940,27 @@ Table 1-10: Files in /proc/fs/ext4/ File Content mb_groups details of multiblock allocator buddy cache of free blocks mb_history multiblock allocation history + stats controls whether the multiblock allocator should start + collecting statistics, which are shown during the unmount + group_prealloc the multiblock allocator will round up allocation + requests to a multiple of this tuning parameter if the + stripe size is not set in the ext4 superblock + max_to_scan The maximum number of extents the multiblock allocator + will search to find the best extent + min_to_scan The minimum number of extents the multiblock allocator + will search to find the best extent + order2_req Tuning parameter which controls the minimum size for + requests (as a power of 2) where the buddy cache is + used + stream_req Files which have fewer blocks than this tunable + parameter will have their blocks allocated out of a + block group specific preallocation pool, so that small + files are packed closely together. Each large file + will have its blocks allocated out of its own unique + preallocation pool. +inode_readahead Tuning parameter which controls the maximum number of + inode table blocks that ext4's inode table readahead + algorithm will pre-read into the buffer cache .............................................................................. diff --git a/trunk/arch/ia64/include/asm/intrinsics.h b/trunk/arch/ia64/include/asm/intrinsics.h index 111ed5222892..c47830e26cb7 100644 --- a/trunk/arch/ia64/include/asm/intrinsics.h +++ b/trunk/arch/ia64/include/asm/intrinsics.h @@ -202,11 +202,7 @@ extern long ia64_cmpxchg_called_with_bad_pointer (void); #ifndef __ASSEMBLY__ #if defined(CONFIG_PARAVIRT) && defined(__KERNEL__) -#ifdef ASM_SUPPORTED -# define IA64_INTRINSIC_API(name) paravirt_ ## name -#else -# define IA64_INTRINSIC_API(name) pv_cpu_ops.name -#endif +#define IA64_INTRINSIC_API(name) pv_cpu_ops.name #define IA64_INTRINSIC_MACRO(name) paravirt_ ## name #else #define IA64_INTRINSIC_API(name) ia64_native_ ## name diff --git a/trunk/arch/ia64/include/asm/mmu_context.h b/trunk/arch/ia64/include/asm/mmu_context.h index 7f2a456603cb..040bc87db930 100644 --- a/trunk/arch/ia64/include/asm/mmu_context.h +++ b/trunk/arch/ia64/include/asm/mmu_context.h @@ -87,7 +87,7 @@ get_mmu_context (struct mm_struct *mm) /* re-check, now that we've got the lock: */ context = mm->context; if (context == 0) { - cpumask_clear(mm_cpumask(mm)); + cpus_clear(mm->cpu_vm_mask); if (ia64_ctx.next >= ia64_ctx.limit) { ia64_ctx.next = find_next_zero_bit(ia64_ctx.bitmap, ia64_ctx.max_ctx, ia64_ctx.next); @@ -166,8 +166,8 @@ activate_context (struct mm_struct *mm) do { context = get_mmu_context(mm); - if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(mm))) - cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm)); + if (!cpu_isset(smp_processor_id(), mm->cpu_vm_mask)) + cpu_set(smp_processor_id(), mm->cpu_vm_mask); reload_context(context); /* * in the unlikely event of a TLB-flush by another thread, diff --git a/trunk/arch/ia64/include/asm/module.h b/trunk/arch/ia64/include/asm/module.h index 908eaef42a08..d2da61e4c49b 100644 --- a/trunk/arch/ia64/include/asm/module.h +++ b/trunk/arch/ia64/include/asm/module.h @@ -16,12 +16,6 @@ struct 
mod_arch_specific { struct elf64_shdr *got; /* global offset table */ struct elf64_shdr *opd; /* official procedure descriptors */ struct elf64_shdr *unwind; /* unwind-table section */ -#ifdef CONFIG_PARAVIRT - struct elf64_shdr *paravirt_bundles; - /* paravirt_alt_bundle_patch table */ - struct elf64_shdr *paravirt_insts; - /* paravirt_alt_inst_patch table */ -#endif unsigned long gp; /* global-pointer for module */ void *core_unw_table; /* core unwind-table cookie returned by unwinder */ diff --git a/trunk/arch/ia64/include/asm/native/inst.h b/trunk/arch/ia64/include/asm/native/inst.h index d2d46efb3e6e..0a1026cca4fa 100644 --- a/trunk/arch/ia64/include/asm/native/inst.h +++ b/trunk/arch/ia64/include/asm/native/inst.h @@ -30,9 +30,6 @@ #define __paravirt_work_processed_syscall_target \ ia64_work_processed_syscall -#define paravirt_fsyscall_table ia64_native_fsyscall_table -#define paravirt_fsys_bubble_down ia64_native_fsys_bubble_down - #ifdef CONFIG_PARAVIRT_GUEST_ASM_CLOBBER_CHECK # define PARAVIRT_POISON 0xdeadbeefbaadf00d # define CLOBBER(clob) \ @@ -77,11 +74,6 @@ (pred) mov reg = psr \ CLOBBER(clob) -#define MOV_FROM_ITC(pred, pred_clob, reg, clob) \ -(pred) mov reg = ar.itc \ - CLOBBER(clob) \ - CLOBBER_PRED(pred_clob) - #define MOV_TO_IFA(reg, clob) \ mov cr.ifa = reg \ CLOBBER(clob) @@ -166,11 +158,6 @@ #define RSM_PSR_DT \ rsm psr.dt -#define RSM_PSR_BE_I(clob0, clob1) \ - rsm psr.be | psr.i \ - CLOBBER(clob0) \ - CLOBBER(clob1) - #define SSM_PSR_DT_AND_SRLZ_I \ ssm psr.dt \ ;; \ diff --git a/trunk/arch/ia64/include/asm/native/patchlist.h b/trunk/arch/ia64/include/asm/native/patchlist.h deleted file mode 100644 index be16ca9311bf..000000000000 --- a/trunk/arch/ia64/include/asm/native/patchlist.h +++ /dev/null @@ -1,38 +0,0 @@ -/****************************************************************************** - * arch/ia64/include/asm/native/inst.h - * - * Copyright (c) 2008 Isaku Yamahata - * VA Linux Systems Japan K.K. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - */ - -#define __paravirt_start_gate_fsyscall_patchlist \ - __ia64_native_start_gate_fsyscall_patchlist -#define __paravirt_end_gate_fsyscall_patchlist \ - __ia64_native_end_gate_fsyscall_patchlist -#define __paravirt_start_gate_brl_fsys_bubble_down_patchlist \ - __ia64_native_start_gate_brl_fsys_bubble_down_patchlist -#define __paravirt_end_gate_brl_fsys_bubble_down_patchlist \ - __ia64_native_end_gate_brl_fsys_bubble_down_patchlist -#define __paravirt_start_gate_vtop_patchlist \ - __ia64_native_start_gate_vtop_patchlist -#define __paravirt_end_gate_vtop_patchlist \ - __ia64_native_end_gate_vtop_patchlist -#define __paravirt_start_gate_mckinley_e9_patchlist \ - __ia64_native_start_gate_mckinley_e9_patchlist -#define __paravirt_end_gate_mckinley_e9_patchlist \ - __ia64_native_end_gate_mckinley_e9_patchlist diff --git a/trunk/arch/ia64/include/asm/native/pvchk_inst.h b/trunk/arch/ia64/include/asm/native/pvchk_inst.h index 8d72962ec838..b8e6eb1090d7 100644 --- a/trunk/arch/ia64/include/asm/native/pvchk_inst.h +++ b/trunk/arch/ia64/include/asm/native/pvchk_inst.h @@ -180,11 +180,6 @@ IS_PRED_IN(pred) \ IS_RREG_OUT(reg) \ IS_RREG_CLOB(clob) -#define MOV_FROM_ITC(pred, pred_clob, reg, clob) \ - IS_PRED_IN(pred) \ - IS_PRED_CLOB(pred_clob) \ - IS_RREG_OUT(reg) \ - IS_RREG_CLOB(clob) #define MOV_TO_IFA(reg, clob) \ IS_RREG_IN(reg) \ IS_RREG_CLOB(clob) @@ -251,9 +246,6 @@ IS_RREG_CLOB(clob2) #define RSM_PSR_DT \ nop 0 -#define RSM_PSR_BE_I(clob0, clob1) \ - IS_RREG_CLOB(clob0) \ - IS_RREG_CLOB(clob1) #define SSM_PSR_DT_AND_SRLZ_I \ nop 0 #define BSW_0(clob0, clob1, clob2) \ diff --git a/trunk/arch/ia64/include/asm/paravirt.h b/trunk/arch/ia64/include/asm/paravirt.h index 2eb0a981a09a..2bf3636473fe 100644 --- a/trunk/arch/ia64/include/asm/paravirt.h +++ b/trunk/arch/ia64/include/asm/paravirt.h @@ -22,56 +22,6 @@ #ifndef __ASM_PARAVIRT_H #define __ASM_PARAVIRT_H -#ifndef __ASSEMBLY__ -/****************************************************************************** - * fsys related addresses - */ -struct pv_fsys_data { - unsigned long *fsyscall_table; - void *fsys_bubble_down; -}; - -extern struct pv_fsys_data pv_fsys_data; - -unsigned long *paravirt_get_fsyscall_table(void); -char *paravirt_get_fsys_bubble_down(void); - -/****************************************************************************** - * patchlist addresses for gate page - */ -enum pv_gate_patchlist { - PV_GATE_START_FSYSCALL, - PV_GATE_END_FSYSCALL, - - PV_GATE_START_BRL_FSYS_BUBBLE_DOWN, - PV_GATE_END_BRL_FSYS_BUBBLE_DOWN, - - PV_GATE_START_VTOP, - PV_GATE_END_VTOP, - - PV_GATE_START_MCKINLEY_E9, - PV_GATE_END_MCKINLEY_E9, -}; - -struct pv_patchdata { - unsigned long start_fsyscall_patchlist; - unsigned long end_fsyscall_patchlist; - unsigned long start_brl_fsys_bubble_down_patchlist; - unsigned long end_brl_fsys_bubble_down_patchlist; - unsigned long start_vtop_patchlist; - unsigned long end_vtop_patchlist; - unsigned long start_mckinley_e9_patchlist; - unsigned long end_mckinley_e9_patchlist; - - void *gate_section; -}; - -extern struct pv_patchdata pv_patchdata; - -unsigned long paravirt_get_gate_patchlist(enum pv_gate_patchlist type); -void *paravirt_get_gate_section(void); -#endif - #ifdef CONFIG_PARAVIRT_GUEST #define PARAVIRT_HYPERVISOR_TYPE_DEFAULT 0 @@ -118,14 +68,6 @@ struct pv_init_ops { int 
(*arch_setup_nomca)(void); void (*post_smp_prepare_boot_cpu)(void); - -#ifdef ASM_SUPPORTED - unsigned long (*patch_bundle)(void *sbundle, void *ebundle, - unsigned long type); - unsigned long (*patch_inst)(unsigned long stag, unsigned long etag, - unsigned long type); -#endif - void (*patch_branch)(unsigned long tag, unsigned long type); }; extern struct pv_init_ops pv_init_ops; @@ -268,8 +210,6 @@ struct pv_time_ops { int (*do_steal_accounting)(unsigned long *new_itm); void (*clocksource_resume)(void); - - unsigned long long (*sched_clock)(void); }; extern struct pv_time_ops pv_time_ops; @@ -287,11 +227,6 @@ paravirt_do_steal_accounting(unsigned long *new_itm) return pv_time_ops.do_steal_accounting(new_itm); } -static inline unsigned long long paravirt_sched_clock(void) -{ - return pv_time_ops.sched_clock(); -} - #endif /* !__ASSEMBLY__ */ #else diff --git a/trunk/arch/ia64/include/asm/paravirt_patch.h b/trunk/arch/ia64/include/asm/paravirt_patch.h deleted file mode 100644 index 128ff5db6e67..000000000000 --- a/trunk/arch/ia64/include/asm/paravirt_patch.h +++ /dev/null @@ -1,143 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2008 Isaku Yamahata - * VA Linux Systems Japan K.K. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - */ - -#ifndef __ASM_PARAVIRT_PATCH_H -#define __ASM_PARAVIRT_PATCH_H - -#ifdef __ASSEMBLY__ - - .section .paravirt_branches, "a" - .previous -#define PARAVIRT_PATCH_SITE_BR(type) \ - { \ - [1:] ; \ - br.cond.sptk.many 2f ; \ - nop.b 0 ; \ - nop.b 0;; ; \ - } ; \ - 2: \ - .xdata8 ".paravirt_branches", 1b, type - -#else - -#include -#include - -/* for binary patch */ -struct paravirt_patch_site_bundle { - void *sbundle; - void *ebundle; - unsigned long type; -}; - -/* label means the beginning of new bundle */ -#define paravirt_alt_bundle(instr, privop) \ - "\t998:\n" \ - "\t" instr "\n" \ - "\t999:\n" \ - "\t.pushsection .paravirt_bundles, \"a\"\n" \ - "\t.popsection\n" \ - "\t.xdata8 \".paravirt_bundles\", 998b, 999b, " \ - __stringify(privop) "\n" - - -struct paravirt_patch_bundle_elem { - const void *sbundle; - const void *ebundle; - unsigned long type; -}; - - -struct paravirt_patch_site_inst { - unsigned long stag; - unsigned long etag; - unsigned long type; -}; - -#define paravirt_alt_inst(instr, privop) \ - "\t[998:]\n" \ - "\t" instr "\n" \ - "\t[999:]\n" \ - "\t.pushsection .paravirt_insts, \"a\"\n" \ - "\t.popsection\n" \ - "\t.xdata8 \".paravirt_insts\", 998b, 999b, " \ - __stringify(privop) "\n" - -struct paravirt_patch_site_branch { - unsigned long tag; - unsigned long type; -}; - -struct paravirt_patch_branch_target { - const void *entry; - unsigned long type; -}; - -void -__paravirt_patch_apply_branch( - unsigned long tag, unsigned long type, - const struct paravirt_patch_branch_target *entries, - unsigned int nr_entries); - 
-void -paravirt_patch_reloc_br(unsigned long tag, const void *target); - -void -paravirt_patch_reloc_brl(unsigned long tag, const void *target); - - -#if defined(ASM_SUPPORTED) && defined(CONFIG_PARAVIRT) -unsigned long -ia64_native_patch_bundle(void *sbundle, void *ebundle, unsigned long type); - -unsigned long -__paravirt_patch_apply_bundle(void *sbundle, void *ebundle, unsigned long type, - const struct paravirt_patch_bundle_elem *elems, - unsigned long nelems, - const struct paravirt_patch_bundle_elem **found); - -void -paravirt_patch_apply_bundle(const struct paravirt_patch_site_bundle *start, - const struct paravirt_patch_site_bundle *end); - -void -paravirt_patch_apply_inst(const struct paravirt_patch_site_inst *start, - const struct paravirt_patch_site_inst *end); - -void paravirt_patch_apply(void); -#else -#define paravirt_patch_apply_bundle(start, end) do { } while (0) -#define paravirt_patch_apply_inst(start, end) do { } while (0) -#define paravirt_patch_apply() do { } while (0) -#endif - -#endif /* !__ASSEMBLEY__ */ - -#endif /* __ASM_PARAVIRT_PATCH_H */ - -/* - * Local variables: - * mode: C - * c-set-style: "linux" - * c-basic-offset: 8 - * tab-width: 8 - * indent-tabs-mode: t - * End: - */ diff --git a/trunk/arch/ia64/include/asm/paravirt_privop.h b/trunk/arch/ia64/include/asm/paravirt_privop.h index 3d2951130b5f..33c8e55f5775 100644 --- a/trunk/arch/ia64/include/asm/paravirt_privop.h +++ b/trunk/arch/ia64/include/asm/paravirt_privop.h @@ -33,7 +33,7 @@ */ struct pv_cpu_ops { - void (*fc)(void *addr); + void (*fc)(unsigned long addr); unsigned long (*thash)(unsigned long addr); unsigned long (*get_cpuid)(int index); unsigned long (*get_pmd)(int index); @@ -60,18 +60,12 @@ extern unsigned long ia64_native_getreg_func(int regnum); /* Instructions paravirtualized for performance */ /************************************************/ -#ifndef ASM_SUPPORTED -#define paravirt_ssm_i() pv_cpu_ops.ssm_i() -#define paravirt_rsm_i() pv_cpu_ops.rsm_i() -#define __paravirt_getreg() pv_cpu_ops.getreg() -#endif - /* mask for ia64_native_ssm/rsm() must be constant.("i" constraing). * static inline function doesn't satisfy it. 
*/ #define paravirt_ssm(mask) \ do { \ if ((mask) == IA64_PSR_I) \ - paravirt_ssm_i(); \ + pv_cpu_ops.ssm_i(); \ else \ ia64_native_ssm(mask); \ } while (0) @@ -79,7 +73,7 @@ extern unsigned long ia64_native_getreg_func(int regnum); #define paravirt_rsm(mask) \ do { \ if ((mask) == IA64_PSR_I) \ - paravirt_rsm_i(); \ + pv_cpu_ops.rsm_i(); \ else \ ia64_native_rsm(mask); \ } while (0) @@ -92,7 +86,7 @@ extern unsigned long ia64_native_getreg_func(int regnum); if ((reg) == _IA64_REG_IP) \ res = ia64_native_getreg(_IA64_REG_IP); \ else \ - res = __paravirt_getreg(reg); \ + res = pv_cpu_ops.getreg(reg); \ res; \ }) @@ -118,12 +112,6 @@ void paravirt_cpu_asm_init(const struct pv_cpu_asm_switch *cpu_asm_switch); #endif /* CONFIG_PARAVIRT */ -#if defined(CONFIG_PARAVIRT) && defined(ASM_SUPPORTED) -#define paravirt_dv_serialize_data() ia64_dv_serialize_data() -#else -#define paravirt_dv_serialize_data() /* nothing */ -#endif - /* these routines utilize privilege-sensitive or performance-sensitive * privileged instructions so the code must be replaced with * paravirtualized versions */ @@ -133,349 +121,4 @@ void paravirt_cpu_asm_init(const struct pv_cpu_asm_switch *cpu_asm_switch); IA64_PARAVIRT_ASM_FUNC(work_processed_syscall) #define ia64_leave_kernel IA64_PARAVIRT_ASM_FUNC(leave_kernel) - -#if defined(CONFIG_PARAVIRT) -/****************************************************************************** - * binary patching infrastructure - */ -#define PARAVIRT_PATCH_TYPE_FC 1 -#define PARAVIRT_PATCH_TYPE_THASH 2 -#define PARAVIRT_PATCH_TYPE_GET_CPUID 3 -#define PARAVIRT_PATCH_TYPE_GET_PMD 4 -#define PARAVIRT_PATCH_TYPE_PTCGA 5 -#define PARAVIRT_PATCH_TYPE_GET_RR 6 -#define PARAVIRT_PATCH_TYPE_SET_RR 7 -#define PARAVIRT_PATCH_TYPE_SET_RR0_TO_RR4 8 -#define PARAVIRT_PATCH_TYPE_SSM_I 9 -#define PARAVIRT_PATCH_TYPE_RSM_I 10 -#define PARAVIRT_PATCH_TYPE_GET_PSR_I 11 -#define PARAVIRT_PATCH_TYPE_INTRIN_LOCAL_IRQ_RESTORE 12 - -/* PARAVIRT_PATY_TYPE_[GS]ETREG + _IA64_REG_xxx */ -#define PARAVIRT_PATCH_TYPE_GETREG 0x10000000 -#define PARAVIRT_PATCH_TYPE_SETREG 0x20000000 - -/* - * struct task_struct* (*ia64_switch_to)(void* next_task); - * void *ia64_leave_syscall; - * void *ia64_work_processed_syscall - * void *ia64_leave_kernel; - */ - -#define PARAVIRT_PATCH_TYPE_BR_START 0x30000000 -#define PARAVIRT_PATCH_TYPE_BR_SWITCH_TO \ - (PARAVIRT_PATCH_TYPE_BR_START + 0) -#define PARAVIRT_PATCH_TYPE_BR_LEAVE_SYSCALL \ - (PARAVIRT_PATCH_TYPE_BR_START + 1) -#define PARAVIRT_PATCH_TYPE_BR_WORK_PROCESSED_SYSCALL \ - (PARAVIRT_PATCH_TYPE_BR_START + 2) -#define PARAVIRT_PATCH_TYPE_BR_LEAVE_KERNEL \ - (PARAVIRT_PATCH_TYPE_BR_START + 3) - -#ifdef ASM_SUPPORTED -#include - -/* - * pv_cpu_ops calling stub. - * normal function call convension can't be written by gcc - * inline assembly. - * - * from the caller's point of view, - * the following registers will be clobbered. - * r2, r3 - * r8-r15 - * r16, r17 - * b6, b7 - * p6-p15 - * ar.ccv - * - * from the callee's point of view , - * the following registers can be used. - * r2, r3: scratch - * r8: scratch, input argument0 and return value - * r0-r15: scratch, input argument1-5 - * b6: return pointer - * b7: scratch - * p6-p15: scratch - * ar.ccv: scratch - * - * other registers must not be changed. especially - * b0: rp: preserved. gcc ignores b0 in clobbered register. 
- * r16: saved gp - */ -/* 5 bundles */ -#define __PARAVIRT_BR \ - ";;\n" \ - "{ .mlx\n" \ - "nop 0\n" \ - "movl r2 = %[op_addr]\n"/* get function pointer address */ \ - ";;\n" \ - "}\n" \ - "1:\n" \ - "{ .mii\n" \ - "ld8 r2 = [r2]\n" /* load function descriptor address */ \ - "mov r17 = ip\n" /* get ip to calc return address */ \ - "mov r16 = gp\n" /* save gp */ \ - ";;\n" \ - "}\n" \ - "{ .mii\n" \ - "ld8 r3 = [r2], 8\n" /* load entry address */ \ - "adds r17 = 1f - 1b, r17\n" /* calculate return address */ \ - ";;\n" \ - "mov b7 = r3\n" /* set entry address */ \ - "}\n" \ - "{ .mib\n" \ - "ld8 gp = [r2]\n" /* load gp value */ \ - "mov b6 = r17\n" /* set return address */ \ - "br.cond.sptk.few b7\n" /* intrinsics are very short isns */ \ - "}\n" \ - "1:\n" \ - "{ .mii\n" \ - "mov gp = r16\n" /* restore gp value */ \ - "nop 0\n" \ - "nop 0\n" \ - ";;\n" \ - "}\n" - -#define PARAVIRT_OP(op) \ - [op_addr] "i"(&pv_cpu_ops.op) - -#define PARAVIRT_TYPE(type) \ - PARAVIRT_PATCH_TYPE_ ## type - -#define PARAVIRT_REG_CLOBBERS0 \ - "r2", "r3", /*"r8",*/ "r9", "r10", "r11", "r14", \ - "r15", "r16", "r17" - -#define PARAVIRT_REG_CLOBBERS1 \ - "r2","r3", /*"r8",*/ "r9", "r10", "r11", "r14", \ - "r15", "r16", "r17" - -#define PARAVIRT_REG_CLOBBERS2 \ - "r2", "r3", /*"r8", "r9",*/ "r10", "r11", "r14", \ - "r15", "r16", "r17" - -#define PARAVIRT_REG_CLOBBERS5 \ - "r2", "r3", /*"r8", "r9", "r10", "r11", "r14",*/ \ - "r15", "r16", "r17" - -#define PARAVIRT_BR_CLOBBERS \ - "b6", "b7" - -#define PARAVIRT_PR_CLOBBERS \ - "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15" - -#define PARAVIRT_AR_CLOBBERS \ - "ar.ccv" - -#define PARAVIRT_CLOBBERS0 \ - PARAVIRT_REG_CLOBBERS0, \ - PARAVIRT_BR_CLOBBERS, \ - PARAVIRT_PR_CLOBBERS, \ - PARAVIRT_AR_CLOBBERS, \ - "memory" - -#define PARAVIRT_CLOBBERS1 \ - PARAVIRT_REG_CLOBBERS1, \ - PARAVIRT_BR_CLOBBERS, \ - PARAVIRT_PR_CLOBBERS, \ - PARAVIRT_AR_CLOBBERS, \ - "memory" - -#define PARAVIRT_CLOBBERS2 \ - PARAVIRT_REG_CLOBBERS2, \ - PARAVIRT_BR_CLOBBERS, \ - PARAVIRT_PR_CLOBBERS, \ - PARAVIRT_AR_CLOBBERS, \ - "memory" - -#define PARAVIRT_CLOBBERS5 \ - PARAVIRT_REG_CLOBBERS5, \ - PARAVIRT_BR_CLOBBERS, \ - PARAVIRT_PR_CLOBBERS, \ - PARAVIRT_AR_CLOBBERS, \ - "memory" - -#define PARAVIRT_BR0(op, type) \ - register unsigned long ia64_clobber asm ("r8"); \ - asm volatile (paravirt_alt_bundle(__PARAVIRT_BR, \ - PARAVIRT_TYPE(type)) \ - : "=r"(ia64_clobber) \ - : PARAVIRT_OP(op) \ - : PARAVIRT_CLOBBERS0) - -#define PARAVIRT_BR0_RET(op, type) \ - register unsigned long ia64_intri_res asm ("r8"); \ - asm volatile (paravirt_alt_bundle(__PARAVIRT_BR, \ - PARAVIRT_TYPE(type)) \ - : "=r"(ia64_intri_res) \ - : PARAVIRT_OP(op) \ - : PARAVIRT_CLOBBERS0) - -#define PARAVIRT_BR1(op, type, arg1) \ - register unsigned long __##arg1 asm ("r8") = arg1; \ - register unsigned long ia64_clobber asm ("r8"); \ - asm volatile (paravirt_alt_bundle(__PARAVIRT_BR, \ - PARAVIRT_TYPE(type)) \ - : "=r"(ia64_clobber) \ - : PARAVIRT_OP(op), "0"(__##arg1) \ - : PARAVIRT_CLOBBERS1) - -#define PARAVIRT_BR1_RET(op, type, arg1) \ - register unsigned long ia64_intri_res asm ("r8"); \ - register unsigned long __##arg1 asm ("r8") = arg1; \ - asm volatile (paravirt_alt_bundle(__PARAVIRT_BR, \ - PARAVIRT_TYPE(type)) \ - : "=r"(ia64_intri_res) \ - : PARAVIRT_OP(op), "0"(__##arg1) \ - : PARAVIRT_CLOBBERS1) - -#define PARAVIRT_BR1_VOID(op, type, arg1) \ - register void *__##arg1 asm ("r8") = arg1; \ - register unsigned long ia64_clobber asm ("r8"); \ - asm volatile (paravirt_alt_bundle(__PARAVIRT_BR, \ - 
PARAVIRT_TYPE(type)) \ - : "=r"(ia64_clobber) \ - : PARAVIRT_OP(op), "0"(__##arg1) \ - : PARAVIRT_CLOBBERS1) - -#define PARAVIRT_BR2(op, type, arg1, arg2) \ - register unsigned long __##arg1 asm ("r8") = arg1; \ - register unsigned long __##arg2 asm ("r9") = arg2; \ - register unsigned long ia64_clobber1 asm ("r8"); \ - register unsigned long ia64_clobber2 asm ("r9"); \ - asm volatile (paravirt_alt_bundle(__PARAVIRT_BR, \ - PARAVIRT_TYPE(type)) \ - : "=r"(ia64_clobber1), "=r"(ia64_clobber2) \ - : PARAVIRT_OP(op), "0"(__##arg1), "1"(__##arg2) \ - : PARAVIRT_CLOBBERS2) - - -#define PARAVIRT_DEFINE_CPU_OP0(op, type) \ - static inline void \ - paravirt_ ## op (void) \ - { \ - PARAVIRT_BR0(op, type); \ - } - -#define PARAVIRT_DEFINE_CPU_OP0_RET(op, type) \ - static inline unsigned long \ - paravirt_ ## op (void) \ - { \ - PARAVIRT_BR0_RET(op, type); \ - return ia64_intri_res; \ - } - -#define PARAVIRT_DEFINE_CPU_OP1_VOID(op, type) \ - static inline void \ - paravirt_ ## op (void *arg1) \ - { \ - PARAVIRT_BR1_VOID(op, type, arg1); \ - } - -#define PARAVIRT_DEFINE_CPU_OP1(op, type) \ - static inline void \ - paravirt_ ## op (unsigned long arg1) \ - { \ - PARAVIRT_BR1(op, type, arg1); \ - } - -#define PARAVIRT_DEFINE_CPU_OP1_RET(op, type) \ - static inline unsigned long \ - paravirt_ ## op (unsigned long arg1) \ - { \ - PARAVIRT_BR1_RET(op, type, arg1); \ - return ia64_intri_res; \ - } - -#define PARAVIRT_DEFINE_CPU_OP2(op, type) \ - static inline void \ - paravirt_ ## op (unsigned long arg1, \ - unsigned long arg2) \ - { \ - PARAVIRT_BR2(op, type, arg1, arg2); \ - } - - -PARAVIRT_DEFINE_CPU_OP1_VOID(fc, FC); -PARAVIRT_DEFINE_CPU_OP1_RET(thash, THASH) -PARAVIRT_DEFINE_CPU_OP1_RET(get_cpuid, GET_CPUID) -PARAVIRT_DEFINE_CPU_OP1_RET(get_pmd, GET_PMD) -PARAVIRT_DEFINE_CPU_OP2(ptcga, PTCGA) -PARAVIRT_DEFINE_CPU_OP1_RET(get_rr, GET_RR) -PARAVIRT_DEFINE_CPU_OP2(set_rr, SET_RR) -PARAVIRT_DEFINE_CPU_OP0(ssm_i, SSM_I) -PARAVIRT_DEFINE_CPU_OP0(rsm_i, RSM_I) -PARAVIRT_DEFINE_CPU_OP0_RET(get_psr_i, GET_PSR_I) -PARAVIRT_DEFINE_CPU_OP1(intrin_local_irq_restore, INTRIN_LOCAL_IRQ_RESTORE) - -static inline void -paravirt_set_rr0_to_rr4(unsigned long val0, unsigned long val1, - unsigned long val2, unsigned long val3, - unsigned long val4) -{ - register unsigned long __val0 asm ("r8") = val0; - register unsigned long __val1 asm ("r9") = val1; - register unsigned long __val2 asm ("r10") = val2; - register unsigned long __val3 asm ("r11") = val3; - register unsigned long __val4 asm ("r14") = val4; - - register unsigned long ia64_clobber0 asm ("r8"); - register unsigned long ia64_clobber1 asm ("r9"); - register unsigned long ia64_clobber2 asm ("r10"); - register unsigned long ia64_clobber3 asm ("r11"); - register unsigned long ia64_clobber4 asm ("r14"); - - asm volatile (paravirt_alt_bundle(__PARAVIRT_BR, - PARAVIRT_TYPE(SET_RR0_TO_RR4)) - : "=r"(ia64_clobber0), - "=r"(ia64_clobber1), - "=r"(ia64_clobber2), - "=r"(ia64_clobber3), - "=r"(ia64_clobber4) - : PARAVIRT_OP(set_rr0_to_rr4), - "0"(__val0), "1"(__val1), "2"(__val2), - "3"(__val3), "4"(__val4) - : PARAVIRT_CLOBBERS5); -} - -/* unsigned long paravirt_getreg(int reg) */ -#define __paravirt_getreg(reg) \ - ({ \ - register unsigned long ia64_intri_res asm ("r8"); \ - register unsigned long __reg asm ("r8") = (reg); \ - \ - BUILD_BUG_ON(!__builtin_constant_p(reg)); \ - asm volatile (paravirt_alt_bundle(__PARAVIRT_BR, \ - PARAVIRT_TYPE(GETREG) \ - + (reg)) \ - : "=r"(ia64_intri_res) \ - : PARAVIRT_OP(getreg), "0"(__reg) \ - : PARAVIRT_CLOBBERS1); \ - \ - 
ia64_intri_res; \ - }) - -/* void paravirt_setreg(int reg, unsigned long val) */ -#define paravirt_setreg(reg, val) \ - do { \ - register unsigned long __val asm ("r8") = val; \ - register unsigned long __reg asm ("r9") = reg; \ - register unsigned long ia64_clobber1 asm ("r8"); \ - register unsigned long ia64_clobber2 asm ("r9"); \ - \ - BUILD_BUG_ON(!__builtin_constant_p(reg)); \ - asm volatile (paravirt_alt_bundle(__PARAVIRT_BR, \ - PARAVIRT_TYPE(SETREG) \ - + (reg)) \ - : "=r"(ia64_clobber1), \ - "=r"(ia64_clobber2) \ - : PARAVIRT_OP(setreg), \ - "1"(__reg), "0"(__val) \ - : PARAVIRT_CLOBBERS2); \ - } while (0) - -#endif /* ASM_SUPPORTED */ -#endif /* CONFIG_PARAVIRT && ASM_SUPPOTED */ - #endif /* _ASM_IA64_PARAVIRT_PRIVOP_H */ diff --git a/trunk/arch/ia64/include/asm/smp.h b/trunk/arch/ia64/include/asm/smp.h index 598408336251..21c402365d0e 100644 --- a/trunk/arch/ia64/include/asm/smp.h +++ b/trunk/arch/ia64/include/asm/smp.h @@ -126,8 +126,7 @@ extern void identify_siblings (struct cpuinfo_ia64 *); extern int is_multithreading_enabled(void); extern void arch_send_call_function_single_ipi(int cpu); -extern void arch_send_call_function_ipi_mask(const struct cpumask *mask); -#define arch_send_call_function_ipi_mask arch_send_call_function_ipi_mask +extern void arch_send_call_function_ipi(cpumask_t mask); #else /* CONFIG_SMP */ diff --git a/trunk/arch/ia64/include/asm/timex.h b/trunk/arch/ia64/include/asm/timex.h index 86c7db861180..4e03cfe74a0c 100644 --- a/trunk/arch/ia64/include/asm/timex.h +++ b/trunk/arch/ia64/include/asm/timex.h @@ -40,6 +40,5 @@ get_cycles (void) } extern void ia64_cpu_local_tick (void); -extern unsigned long long ia64_native_sched_clock (void); #endif /* _ASM_IA64_TIMEX_H */ diff --git a/trunk/arch/ia64/include/asm/topology.h b/trunk/arch/ia64/include/asm/topology.h index 7b4c8c70b2d1..f260dcf21515 100644 --- a/trunk/arch/ia64/include/asm/topology.h +++ b/trunk/arch/ia64/include/asm/topology.h @@ -112,6 +112,11 @@ void build_cpu_to_node_map(void); extern void arch_fix_phys_package_id(int num, u32 slot); +#define pcibus_to_cpumask(bus) (pcibus_to_node(bus) == -1 ? \ + CPU_MASK_ALL : \ + node_to_cpumask(pcibus_to_node(bus)) \ + ) + #define cpumask_of_pcibus(bus) (pcibus_to_node(bus) == -1 ? 
\ cpu_all_mask : \ cpumask_of_node(pcibus_to_node(bus))) diff --git a/trunk/arch/ia64/include/asm/xen/hypervisor.h b/trunk/arch/ia64/include/asm/xen/hypervisor.h index e425227a418e..7a804e80fc67 100644 --- a/trunk/arch/ia64/include/asm/xen/hypervisor.h +++ b/trunk/arch/ia64/include/asm/xen/hypervisor.h @@ -33,6 +33,9 @@ #ifndef _ASM_IA64_XEN_HYPERVISOR_H #define _ASM_IA64_XEN_HYPERVISOR_H +#ifdef CONFIG_XEN + +#include #include #include /* to compile feature.c */ #include /* to comiple xen-netfront.c */ @@ -40,32 +43,22 @@ /* xen_domain_type is set before executing any C code by early_xen_setup */ enum xen_domain_type { - XEN_NATIVE, /* running on bare hardware */ - XEN_PV_DOMAIN, /* running in a PV domain */ - XEN_HVM_DOMAIN, /* running in a Xen hvm domain*/ + XEN_NATIVE, + XEN_PV_DOMAIN, + XEN_HVM_DOMAIN, }; -#ifdef CONFIG_XEN extern enum xen_domain_type xen_domain_type; -#else -#define xen_domain_type XEN_NATIVE -#endif #define xen_domain() (xen_domain_type != XEN_NATIVE) -#define xen_pv_domain() (xen_domain() && \ - xen_domain_type == XEN_PV_DOMAIN) -#define xen_hvm_domain() (xen_domain() && \ - xen_domain_type == XEN_HVM_DOMAIN) - -#ifdef CONFIG_XEN_DOM0 -#define xen_initial_domain() (xen_pv_domain() && \ +#define xen_pv_domain() (xen_domain_type == XEN_PV_DOMAIN) +#define xen_initial_domain() (xen_pv_domain() && \ (xen_start_info->flags & SIF_INITDOMAIN)) -#else -#define xen_initial_domain() (0) -#endif +#define xen_hvm_domain() (xen_domain_type == XEN_HVM_DOMAIN) +/* deprecated. remove this */ +#define is_running_on_xen() (xen_domain_type == XEN_PV_DOMAIN) -#ifdef CONFIG_XEN extern struct shared_info *HYPERVISOR_shared_info; extern struct start_info *xen_start_info; @@ -81,6 +74,16 @@ void force_evtchn_callback(void); /* For setup_arch() in arch/ia64/kernel/setup.c */ void xen_ia64_enable_opt_feature(void); + +#else /* CONFIG_XEN */ + +#define xen_domain() (0) +#define xen_pv_domain() (0) +#define xen_initial_domain() (0) +#define xen_hvm_domain() (0) +#define is_running_on_xen() (0) /* deprecated. remove this */ #endif +#define is_initial_xendomain() (0) /* deprecated. remove this */ + #endif /* _ASM_IA64_XEN_HYPERVISOR_H */ diff --git a/trunk/arch/ia64/include/asm/xen/inst.h b/trunk/arch/ia64/include/asm/xen/inst.h index c53a47611208..19c2ae1d878a 100644 --- a/trunk/arch/ia64/include/asm/xen/inst.h +++ b/trunk/arch/ia64/include/asm/xen/inst.h @@ -33,9 +33,6 @@ #define __paravirt_work_processed_syscall_target \ xen_work_processed_syscall -#define paravirt_fsyscall_table xen_fsyscall_table -#define paravirt_fsys_bubble_down xen_fsys_bubble_down - #define MOV_FROM_IFA(reg) \ movl reg = XSI_IFA; \ ;; \ @@ -113,27 +110,6 @@ .endm #define MOV_FROM_PSR(pred, reg, clob) __MOV_FROM_PSR pred, reg, clob -/* assuming ar.itc is read with interrupt disabled. 
*/ -#define MOV_FROM_ITC(pred, pred_clob, reg, clob) \ -(pred) movl clob = XSI_ITC_OFFSET; \ - ;; \ -(pred) ld8 clob = [clob]; \ -(pred) mov reg = ar.itc; \ - ;; \ -(pred) add reg = reg, clob; \ - ;; \ -(pred) movl clob = XSI_ITC_LAST; \ - ;; \ -(pred) ld8 clob = [clob]; \ - ;; \ -(pred) cmp.geu.unc pred_clob, p0 = clob, reg; \ - ;; \ -(pred_clob) add reg = 1, clob; \ - ;; \ -(pred) movl clob = XSI_ITC_LAST; \ - ;; \ -(pred) st8 [clob] = reg - #define MOV_TO_IFA(reg, clob) \ movl clob = XSI_IFA; \ @@ -386,10 +362,6 @@ #define RSM_PSR_DT \ XEN_HYPER_RSM_PSR_DT -#define RSM_PSR_BE_I(clob0, clob1) \ - RSM_PSR_I(p0, clob0, clob1); \ - rum psr.be - #define SSM_PSR_DT_AND_SRLZ_I \ XEN_HYPER_SSM_PSR_DT diff --git a/trunk/arch/ia64/include/asm/xen/interface.h b/trunk/arch/ia64/include/asm/xen/interface.h index e951e740bdf2..f00fab40854d 100644 --- a/trunk/arch/ia64/include/asm/xen/interface.h +++ b/trunk/arch/ia64/include/asm/xen/interface.h @@ -209,15 +209,6 @@ struct mapped_regs { unsigned long krs[8]; /* kernel registers */ unsigned long tmp[16]; /* temp registers (e.g. for hyperprivops) */ - - /* itc paravirtualization - * vAR.ITC = mAR.ITC + itc_offset - * itc_last is one which was lastly passed to - * the guest OS in order to prevent it from - * going backwords. - */ - unsigned long itc_offset; - unsigned long itc_last; }; }; }; diff --git a/trunk/arch/ia64/include/asm/xen/minstate.h b/trunk/arch/ia64/include/asm/xen/minstate.h index c57fa910f2c9..4d92d9bbda7b 100644 --- a/trunk/arch/ia64/include/asm/xen/minstate.h +++ b/trunk/arch/ia64/include/asm/xen/minstate.h @@ -1,12 +1,3 @@ - -#ifdef CONFIG_VIRT_CPU_ACCOUNTING -/* read ar.itc in advance, and use it before leaving bank 0 */ -#define XEN_ACCOUNT_GET_STAMP \ - MOV_FROM_ITC(pUStk, p6, r20, r2); -#else -#define XEN_ACCOUNT_GET_STAMP -#endif - /* * DO_SAVE_MIN switches to the kernel stacks (if necessary) and saves * the minimum state necessary that allows us to turn psr.ic back @@ -132,7 +123,7 @@ ;; \ .mem.offset 0,0; st8.spill [r16]=r2,16; \ .mem.offset 8,0; st8.spill [r17]=r3,16; \ - XEN_ACCOUNT_GET_STAMP \ + ACCOUNT_GET_STAMP \ adds r2=IA64_PT_REGS_R16_OFFSET,r1; \ ;; \ EXTRA; \ diff --git a/trunk/arch/ia64/include/asm/xen/patchlist.h b/trunk/arch/ia64/include/asm/xen/patchlist.h deleted file mode 100644 index eae944e88846..000000000000 --- a/trunk/arch/ia64/include/asm/xen/patchlist.h +++ /dev/null @@ -1,38 +0,0 @@ -/****************************************************************************** - * arch/ia64/include/asm/xen/patchlist.h - * - * Copyright (c) 2008 Isaku Yamahata - * VA Linux Systems Japan K.K. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - */ - -#define __paravirt_start_gate_fsyscall_patchlist \ - __xen_start_gate_fsyscall_patchlist -#define __paravirt_end_gate_fsyscall_patchlist \ - __xen_end_gate_fsyscall_patchlist -#define __paravirt_start_gate_brl_fsys_bubble_down_patchlist \ - __xen_start_gate_brl_fsys_bubble_down_patchlist -#define __paravirt_end_gate_brl_fsys_bubble_down_patchlist \ - __xen_end_gate_brl_fsys_bubble_down_patchlist -#define __paravirt_start_gate_vtop_patchlist \ - __xen_start_gate_vtop_patchlist -#define __paravirt_end_gate_vtop_patchlist \ - __xen_end_gate_vtop_patchlist -#define __paravirt_start_gate_mckinley_e9_patchlist \ - __xen_start_gate_mckinley_e9_patchlist -#define __paravirt_end_gate_mckinley_e9_patchlist \ - __xen_end_gate_mckinley_e9_patchlist diff --git a/trunk/arch/ia64/include/asm/xen/privop.h b/trunk/arch/ia64/include/asm/xen/privop.h index fb4ec5e0b066..71ec7546e100 100644 --- a/trunk/arch/ia64/include/asm/xen/privop.h +++ b/trunk/arch/ia64/include/asm/xen/privop.h @@ -55,8 +55,6 @@ #define XSI_BANK1_R16 (XSI_BASE + XSI_BANK1_R16_OFS) #define XSI_BANKNUM (XSI_BASE + XSI_BANKNUM_OFS) #define XSI_IHA (XSI_BASE + XSI_IHA_OFS) -#define XSI_ITC_OFFSET (XSI_BASE + XSI_ITC_OFFSET_OFS) -#define XSI_ITC_LAST (XSI_BASE + XSI_ITC_LAST_OFS) #endif #ifndef __ASSEMBLY__ @@ -69,7 +67,7 @@ * may have different semantics depending on whether they are executed * at PL0 vs PL!=0. When paravirtualized, these instructions mustn't * be allowed to execute directly, lest incorrect semantics result. */ -extern void xen_fc(void *addr); +extern void xen_fc(unsigned long addr); extern unsigned long xen_thash(unsigned long addr); /* Note that "ttag" and "cover" are also privilege-sensitive; "ttag" @@ -82,10 +80,8 @@ extern unsigned long xen_thash(unsigned long addr); extern unsigned long xen_get_cpuid(int index); extern unsigned long xen_get_pmd(int index); -#ifndef ASM_SUPPORTED extern unsigned long xen_get_eflag(void); /* see xen_ia64_getreg */ extern void xen_set_eflag(unsigned long); /* see xen_ia64_setreg */ -#endif /************************************************/ /* Instructions paravirtualized for performance */ @@ -110,7 +106,6 @@ extern void xen_set_eflag(unsigned long); /* see xen_ia64_setreg */ #define xen_get_virtual_pend() \ (*(((uint8_t *)XEN_MAPPEDREGS->interrupt_mask_addr) - 1)) -#ifndef ASM_SUPPORTED /* Although all privileged operations can be left to trap and will * be properly handled by Xen, some are frequent enough that we use * hyperprivops for performance. 
*/ @@ -128,7 +123,6 @@ extern void xen_set_rr0_to_rr4(unsigned long val0, unsigned long val1, unsigned long val4); extern void xen_set_kr(unsigned long index, unsigned long val); extern void xen_ptcga(unsigned long addr, unsigned long size); -#endif /* !ASM_SUPPORTED */ #endif /* !__ASSEMBLY__ */ diff --git a/trunk/arch/ia64/kernel/Makefile b/trunk/arch/ia64/kernel/Makefile index 5628e9a990a6..f2778f2c4fd9 100644 --- a/trunk/arch/ia64/kernel/Makefile +++ b/trunk/arch/ia64/kernel/Makefile @@ -5,7 +5,7 @@ extra-y := head.o init_task.o vmlinux.lds obj-y := acpi.o entry.o efi.o efi_stub.o gate-data.o fsys.o ia64_ksyms.o irq.o irq_ia64.o \ - irq_lsapic.o ivt.o machvec.o pal.o paravirt_patchlist.o patch.o process.o perfmon.o ptrace.o sal.o \ + irq_lsapic.o ivt.o machvec.o pal.o patch.o process.o perfmon.o ptrace.o sal.o \ salinfo.o setup.o signal.o sys_ia64.o time.o traps.o unaligned.o \ unwind.o mca.o mca_asm.o topology.o dma-mapping.o @@ -36,8 +36,7 @@ obj-$(CONFIG_PCI_MSI) += msi_ia64.o mca_recovery-y += mca_drv.o mca_drv_asm.o obj-$(CONFIG_IA64_MC_ERR_INJECT)+= err_inject.o -obj-$(CONFIG_PARAVIRT) += paravirt.o paravirtentry.o \ - paravirt_patch.o +obj-$(CONFIG_PARAVIRT) += paravirt.o paravirtentry.o obj-$(CONFIG_IA64_ESI) += esi.o ifneq ($(CONFIG_IA64_ESI),) @@ -46,13 +45,35 @@ endif obj-$(CONFIG_DMAR) += pci-dma.o obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o +# The gate DSO image is built using a special linker script. +targets += gate.so gate-syms.o + +extra-y += gate.so gate-syms.o gate.lds gate.o + # fp_emulate() expects f2-f5,f16-f31 to contain the user-level state. CFLAGS_traps.o += -mfixed-range=f2-f5,f16-f31 -# The gate DSO image is built using a special linker script. -include $(srctree)/arch/ia64/kernel/Makefile.gate -# tell compiled for native -CPPFLAGS_gate.lds += -D__IA64_GATE_PARAVIRTUALIZED_NATIVE +CPPFLAGS_gate.lds := -P -C -U$(ARCH) + +quiet_cmd_gate = GATE $@ + cmd_gate = $(CC) -nostdlib $(GATECFLAGS_$(@F)) -Wl,-T,$(filter-out FORCE,$^) -o $@ + +GATECFLAGS_gate.so = -shared -s -Wl,-soname=linux-gate.so.1 \ + $(call ld-option, -Wl$(comma)--hash-style=sysv) +$(obj)/gate.so: $(obj)/gate.lds $(obj)/gate.o FORCE + $(call if_changed,gate) + +$(obj)/built-in.o: $(obj)/gate-syms.o +$(obj)/built-in.o: ld_flags += -R $(obj)/gate-syms.o + +GATECFLAGS_gate-syms.o = -r +$(obj)/gate-syms.o: $(obj)/gate.lds $(obj)/gate.o FORCE + $(call if_changed,gate) + +# gate-data.o contains the gate DSO image as data in section .data.gate. +# We must build gate.so before we can assemble it. +# Note: kbuild does not track this dependency due to usage of .incbin +$(obj)/gate-data.o: $(obj)/gate.so # Calculate NR_IRQ = max(IA64_NATIVE_NR_IRQS, XEN_NR_IRQS, ...) based on config define sed-y @@ -88,9 +109,9 @@ include/asm-ia64/nr-irqs.h: arch/$(SRCARCH)/kernel/nr-irqs.s clean-files += $(objtree)/include/asm-ia64/nr-irqs.h # -# native ivt.S, entry.S and fsys.S +# native ivt.S and entry.S # -ASM_PARAVIRT_OBJS = ivt.o entry.o fsys.o +ASM_PARAVIRT_OBJS = ivt.o entry.o define paravirtualized_native AFLAGS_$(1) += -D__IA64_ASM_PARAVIRTUALIZED_NATIVE AFLAGS_pvchk-sed-$(1) += -D__IA64_ASM_PARAVIRTUALIZED_PVCHECK diff --git a/trunk/arch/ia64/kernel/Makefile.gate b/trunk/arch/ia64/kernel/Makefile.gate deleted file mode 100644 index 1d87f84069b3..000000000000 --- a/trunk/arch/ia64/kernel/Makefile.gate +++ /dev/null @@ -1,27 +0,0 @@ -# The gate DSO image is built using a special linker script. 
- -targets += gate.so gate-syms.o - -extra-y += gate.so gate-syms.o gate.lds gate.o - -CPPFLAGS_gate.lds := -P -C -U$(ARCH) - -quiet_cmd_gate = GATE $@ - cmd_gate = $(CC) -nostdlib $(GATECFLAGS_$(@F)) -Wl,-T,$(filter-out FORCE,$^) -o $@ - -GATECFLAGS_gate.so = -shared -s -Wl,-soname=linux-gate.so.1 \ - $(call ld-option, -Wl$(comma)--hash-style=sysv) -$(obj)/gate.so: $(obj)/gate.lds $(obj)/gate.o FORCE - $(call if_changed,gate) - -$(obj)/built-in.o: $(obj)/gate-syms.o -$(obj)/built-in.o: ld_flags += -R $(obj)/gate-syms.o - -GATECFLAGS_gate-syms.o = -r -$(obj)/gate-syms.o: $(obj)/gate.lds $(obj)/gate.o FORCE - $(call if_changed,gate) - -# gate-data.o contains the gate DSO image as data in section .data.gate. -# We must build gate.so before we can assemble it. -# Note: kbuild does not track this dependency due to usage of .incbin -$(obj)/gate-data.o: $(obj)/gate.so diff --git a/trunk/arch/ia64/kernel/acpi.c b/trunk/arch/ia64/kernel/acpi.c index 5510317db37b..bdef2ce38c8b 100644 --- a/trunk/arch/ia64/kernel/acpi.c +++ b/trunk/arch/ia64/kernel/acpi.c @@ -890,7 +890,7 @@ __init void prefill_possible_map(void) possible, max((possible - available_cpus), 0)); for (i = 0; i < possible; i++) - set_cpu_possible(i, true); + cpu_set(i, cpu_possible_map); } int acpi_map_lsapic(acpi_handle handle, int *pcpu) @@ -928,9 +928,9 @@ int acpi_map_lsapic(acpi_handle handle, int *pcpu) buffer.length = ACPI_ALLOCATE_BUFFER; buffer.pointer = NULL; - cpumask_complement(&tmp_map, cpu_present_mask); - cpu = cpumask_first(&tmp_map); - if (cpu >= nr_cpu_ids) + cpus_complement(tmp_map, cpu_present_map); + cpu = first_cpu(tmp_map); + if (cpu >= NR_CPUS) return -EINVAL; acpi_map_cpu2node(handle, cpu, physid); diff --git a/trunk/arch/ia64/kernel/asm-offsets.c b/trunk/arch/ia64/kernel/asm-offsets.c index af5650169043..742dbb1d5a4f 100644 --- a/trunk/arch/ia64/kernel/asm-offsets.c +++ b/trunk/arch/ia64/kernel/asm-offsets.c @@ -316,7 +316,5 @@ void foo(void) DEFINE_MAPPED_REG_OFS(XSI_BANK1_R16_OFS, bank1_regs[0]); DEFINE_MAPPED_REG_OFS(XSI_B0NATS_OFS, vbnat); DEFINE_MAPPED_REG_OFS(XSI_B1NATS_OFS, vnat); - DEFINE_MAPPED_REG_OFS(XSI_ITC_OFFSET_OFS, itc_offset); - DEFINE_MAPPED_REG_OFS(XSI_ITC_LAST_OFS, itc_last); #endif /* CONFIG_XEN */ } diff --git a/trunk/arch/ia64/kernel/efi.c b/trunk/arch/ia64/kernel/efi.c index 7ef80e8161ce..efaff15d8cf1 100644 --- a/trunk/arch/ia64/kernel/efi.c +++ b/trunk/arch/ia64/kernel/efi.c @@ -456,7 +456,6 @@ efi_map_pal_code (void) GRANULEROUNDDOWN((unsigned long) pal_vaddr), pte_val(pfn_pte(__pa(pal_vaddr) >> PAGE_SHIFT, PAGE_KERNEL)), IA64_GRANULE_SHIFT); - paravirt_dv_serialize_data(); ia64_set_psr(psr); /* restore psr */ } diff --git a/trunk/arch/ia64/kernel/entry.S b/trunk/arch/ia64/kernel/entry.S index ccfdeee9d89f..e5341e2c1175 100644 --- a/trunk/arch/ia64/kernel/entry.S +++ b/trunk/arch/ia64/kernel/entry.S @@ -735,7 +735,7 @@ GLOBAL_ENTRY(__paravirt_leave_syscall) __paravirt_work_processed_syscall: #ifdef CONFIG_VIRT_CPU_ACCOUNTING adds r2=PT(LOADRS)+16,r12 - MOV_FROM_ITC(pUStk, p9, r22, r19) // fetch time at leave +(pUStk) mov.m r22=ar.itc // fetch time at leave adds r18=TI_FLAGS+IA64_TASK_SIZE,r13 ;; (p6) ld4 r31=[r18] // load current_thread_info()->flags @@ -984,7 +984,7 @@ GLOBAL_ENTRY(__paravirt_leave_kernel) #ifdef CONFIG_VIRT_CPU_ACCOUNTING .pred.rel.mutex pUStk,pKStk MOV_FROM_PSR(pKStk, r22, r29) // M2 read PSR now that interrupts are disabled - MOV_FROM_ITC(pUStk, p9, r22, r29) // M fetch time at leave +(pUStk) mov.m r22=ar.itc // M fetch time at leave nop.i 0 ;; #else diff --git 
a/trunk/arch/ia64/kernel/fsys.S b/trunk/arch/ia64/kernel/fsys.S index 3567d54f8cee..c1625c7e1779 100644 --- a/trunk/arch/ia64/kernel/fsys.S +++ b/trunk/arch/ia64/kernel/fsys.S @@ -25,7 +25,6 @@ #include #include "entry.h" -#include "paravirt_inst.h" /* * See Documentation/ia64/fsys.txt for details on fsyscalls. @@ -280,7 +279,7 @@ ENTRY(fsys_gettimeofday) (p9) cmp.eq p13,p0 = 0,r30 // if mmio_ptr, clear p13 jitter control ;; .pred.rel.mutex p8,p9 - MOV_FROM_ITC(p8, p6, r2, r10) // CPU_TIMER. 36 clocks latency!!! +(p8) mov r2 = ar.itc // CPU_TIMER. 36 clocks latency!!! (p9) ld8 r2 = [r30] // MMIO_TIMER. Could also have latency issues.. (p13) ld8 r25 = [r19] // get itc_lastcycle value ld8 r9 = [r22],IA64_TIMESPEC_TV_NSEC_OFFSET // tv_sec @@ -419,7 +418,7 @@ EX(.fail_efault, ld8 r14=[r33]) // r14 <- *set mov r17=(1 << (SIGKILL - 1)) | (1 << (SIGSTOP - 1)) ;; - RSM_PSR_I(p0, r18, r19) // mask interrupt delivery + rsm psr.i // mask interrupt delivery mov ar.ccv=0 andcm r14=r14,r17 // filter out SIGKILL & SIGSTOP @@ -492,7 +491,7 @@ EX(.fail_efault, ld8 r14=[r33]) // r14 <- *set #ifdef CONFIG_SMP st4.rel [r31]=r0 // release the lock #endif - SSM_PSR_I(p0, p9, r31) + ssm psr.i ;; srlz.d // ensure psr.i is set again @@ -514,7 +513,7 @@ EX(.fail_efault, (p15) st8 [r34]=r3) #ifdef CONFIG_SMP st4.rel [r31]=r0 // release the lock #endif - SSM_PSR_I(p0, p9, r17) + ssm psr.i ;; srlz.d br.sptk.many fsys_fallback_syscall // with signal pending, do the heavy-weight syscall @@ -522,7 +521,7 @@ EX(.fail_efault, (p15) st8 [r34]=r3) #ifdef CONFIG_SMP .lock_contention: /* Rather than spinning here, fall back on doing a heavy-weight syscall. */ - SSM_PSR_I(p0, p9, r17) + ssm psr.i ;; srlz.d br.sptk.many fsys_fallback_syscall @@ -593,17 +592,17 @@ ENTRY(fsys_fallback_syscall) adds r17=-1024,r15 movl r14=sys_call_table ;; - RSM_PSR_I(p0, r26, r27) + rsm psr.i shladd r18=r17,3,r14 ;; ld8 r18=[r18] // load normal (heavy-weight) syscall entry-point - MOV_FROM_PSR(p0, r29, r26) // read psr (12 cyc load latency) + mov r29=psr // read psr (12 cyc load latency) mov r27=ar.rsc mov r21=ar.fpsr mov r26=ar.pfs END(fsys_fallback_syscall) /* FALL THROUGH */ -GLOBAL_ENTRY(paravirt_fsys_bubble_down) +GLOBAL_ENTRY(fsys_bubble_down) .prologue .altrp b6 .body @@ -641,7 +640,7 @@ GLOBAL_ENTRY(paravirt_fsys_bubble_down) * * PSR.BE : already is turned off in __kernel_syscall_via_epc() * PSR.AC : don't care (kernel normally turns PSR.AC on) - * PSR.I : already turned off by the time paravirt_fsys_bubble_down gets + * PSR.I : already turned off by the time fsys_bubble_down gets * invoked * PSR.DFL: always 0 (kernel never turns it on) * PSR.DFH: don't care --- kernel never touches f32-f127 on its own @@ -651,7 +650,7 @@ GLOBAL_ENTRY(paravirt_fsys_bubble_down) * PSR.DB : don't care --- kernel never enables kernel-level * breakpoints * PSR.TB : must be 0 already; if it wasn't zero on entry to - * __kernel_syscall_via_epc, the branch to paravirt_fsys_bubble_down + * __kernel_syscall_via_epc, the branch to fsys_bubble_down * will trigger a taken branch; the taken-trap-handler then * converts the syscall into a break-based system-call. 
*/ @@ -684,7 +683,7 @@ GLOBAL_ENTRY(paravirt_fsys_bubble_down) ;; mov ar.rsc=0 // M2 set enforced lazy mode, pl 0, LE, loadrs=0 #ifdef CONFIG_VIRT_CPU_ACCOUNTING - MOV_FROM_ITC(p0, p6, r30, r23) // M get cycle for accounting + mov.m r30=ar.itc // M get cycle for accounting #else nop.m 0 #endif @@ -735,21 +734,21 @@ GLOBAL_ENTRY(paravirt_fsys_bubble_down) mov rp=r14 // I0 set the real return addr and r3=_TIF_SYSCALL_TRACEAUDIT,r3 // A ;; - SSM_PSR_I(p0, p6, r22) // M2 we're on kernel stacks now, reenable irqs + ssm psr.i // M2 we're on kernel stacks now, reenable irqs cmp.eq p8,p0=r3,r0 // A (p10) br.cond.spnt.many ia64_ret_from_syscall // B return if bad call-frame or r15 is a NaT nop.m 0 (p8) br.call.sptk.many b6=b6 // B (ignore return address) br.cond.spnt ia64_trace_syscall // B -END(paravirt_fsys_bubble_down) +END(fsys_bubble_down) .rodata .align 8 - .globl paravirt_fsyscall_table + .globl fsyscall_table - data8 paravirt_fsys_bubble_down -paravirt_fsyscall_table: + data8 fsys_bubble_down +fsyscall_table: data8 fsys_ni_syscall data8 0 // exit // 1025 data8 0 // read @@ -1034,4 +1033,4 @@ paravirt_fsyscall_table: // fill in zeros for the remaining entries .zero: - .space paravirt_fsyscall_table + 8*NR_syscalls - .zero, 0 + .space fsyscall_table + 8*NR_syscalls - .zero, 0 diff --git a/trunk/arch/ia64/kernel/gate.S b/trunk/arch/ia64/kernel/gate.S index cf5e0a105e16..74b1ccce4e84 100644 --- a/trunk/arch/ia64/kernel/gate.S +++ b/trunk/arch/ia64/kernel/gate.S @@ -13,7 +13,6 @@ #include #include #include -#include "paravirt_inst.h" /* * We can't easily refer to symbols inside the kernel. To avoid full runtime relocation, @@ -49,6 +48,87 @@ GLOBAL_ENTRY(__kernel_syscall_via_break) } END(__kernel_syscall_via_break) +/* + * On entry: + * r11 = saved ar.pfs + * r15 = system call # + * b0 = saved return address + * b6 = return address + * On exit: + * r11 = saved ar.pfs + * r15 = system call # + * b0 = saved return address + * all other "scratch" registers: undefined + * all "preserved" registers: same as on entry + */ + +GLOBAL_ENTRY(__kernel_syscall_via_epc) + .prologue + .altrp b6 + .body +{ + /* + * Note: the kernel cannot assume that the first two instructions in this + * bundle get executed. The remaining code must be safe even if + * they do not get executed. + */ + adds r17=-1024,r15 // A + mov r10=0 // A default to successful syscall execution + epc // B causes split-issue +} + ;; + rsm psr.be | psr.i // M2 (5 cyc to srlz.d) + LOAD_FSYSCALL_TABLE(r14) // X + ;; + mov r16=IA64_KR(CURRENT) // M2 (12 cyc) + shladd r18=r17,3,r14 // A + mov r19=NR_syscalls-1 // A + ;; + lfetch [r18] // M0|1 + mov r29=psr // M2 (12 cyc) + // If r17 is a NaT, p6 will be zero + cmp.geu p6,p7=r19,r17 // A (sysnr > 0 && sysnr < 1024+NR_syscalls)? + ;; + mov r21=ar.fpsr // M2 (12 cyc) + tnat.nz p10,p9=r15 // I0 + mov.i r26=ar.pfs // I0 (would stall anyhow due to srlz.d...) + ;; + srlz.d // M0 (forces split-issue) ensure PSR.BE==0 +(p6) ld8 r18=[r18] // M0|1 + nop.i 0 + ;; + nop.m 0 +(p6) tbit.z.unc p8,p0=r18,0 // I0 (dual-issues with "mov b7=r18"!) + nop.i 0 + ;; +(p8) ssm psr.i +(p6) mov b7=r18 // I0 +(p8) br.dptk.many b7 // B + + mov r27=ar.rsc // M2 (12 cyc) +/* + * brl.cond doesn't work as intended because the linker would convert this branch + * into a branch to a PLT. Perhaps there will be a way to avoid this with some + * future version of the linker. In the meantime, we just use an indirect branch + * instead. 
+ */ +#ifdef CONFIG_ITANIUM +(p6) add r14=-8,r14 // r14 <- addr of fsys_bubble_down entry + ;; +(p6) ld8 r14=[r14] // r14 <- fsys_bubble_down + ;; +(p6) mov b7=r14 +(p6) br.sptk.many b7 +#else + BRL_COND_FSYS_BUBBLE_DOWN(p6) +#endif + ssm psr.i + mov r10=-1 +(p10) mov r8=EINVAL +(p9) mov r8=ENOSYS + FSYS_RETURN +END(__kernel_syscall_via_epc) + # define ARG0_OFF (16 + IA64_SIGFRAME_ARG0_OFFSET) # define ARG1_OFF (16 + IA64_SIGFRAME_ARG1_OFFSET) # define ARG2_OFF (16 + IA64_SIGFRAME_ARG2_OFFSET) @@ -294,92 +374,3 @@ restore_rbs: // invala not necessary as that will happen when returning to user-mode br.cond.sptk back_from_restore_rbs END(__kernel_sigtramp) - -/* - * On entry: - * r11 = saved ar.pfs - * r15 = system call # - * b0 = saved return address - * b6 = return address - * On exit: - * r11 = saved ar.pfs - * r15 = system call # - * b0 = saved return address - * all other "scratch" registers: undefined - * all "preserved" registers: same as on entry - */ - -GLOBAL_ENTRY(__kernel_syscall_via_epc) - .prologue - .altrp b6 - .body -{ - /* - * Note: the kernel cannot assume that the first two instructions in this - * bundle get executed. The remaining code must be safe even if - * they do not get executed. - */ - adds r17=-1024,r15 // A - mov r10=0 // A default to successful syscall execution - epc // B causes split-issue -} - ;; - RSM_PSR_BE_I(r20, r22) // M2 (5 cyc to srlz.d) - LOAD_FSYSCALL_TABLE(r14) // X - ;; - mov r16=IA64_KR(CURRENT) // M2 (12 cyc) - shladd r18=r17,3,r14 // A - mov r19=NR_syscalls-1 // A - ;; - lfetch [r18] // M0|1 - MOV_FROM_PSR(p0, r29, r8) // M2 (12 cyc) - // If r17 is a NaT, p6 will be zero - cmp.geu p6,p7=r19,r17 // A (sysnr > 0 && sysnr < 1024+NR_syscalls)? - ;; - mov r21=ar.fpsr // M2 (12 cyc) - tnat.nz p10,p9=r15 // I0 - mov.i r26=ar.pfs // I0 (would stall anyhow due to srlz.d...) - ;; - srlz.d // M0 (forces split-issue) ensure PSR.BE==0 -(p6) ld8 r18=[r18] // M0|1 - nop.i 0 - ;; - nop.m 0 -(p6) tbit.z.unc p8,p0=r18,0 // I0 (dual-issues with "mov b7=r18"!) - nop.i 0 - ;; - SSM_PSR_I(p8, p14, r25) -(p6) mov b7=r18 // I0 -(p8) br.dptk.many b7 // B - - mov r27=ar.rsc // M2 (12 cyc) -/* - * brl.cond doesn't work as intended because the linker would convert this branch - * into a branch to a PLT. Perhaps there will be a way to avoid this with some - * future version of the linker. In the meantime, we just use an indirect branch - * instead. - */ -#ifdef CONFIG_ITANIUM -(p6) add r14=-8,r14 // r14 <- addr of fsys_bubble_down entry - ;; -(p6) ld8 r14=[r14] // r14 <- fsys_bubble_down - ;; -(p6) mov b7=r14 -(p6) br.sptk.many b7 -#else - BRL_COND_FSYS_BUBBLE_DOWN(p6) -#endif - SSM_PSR_I(p0, p14, r10) - mov r10=-1 -(p10) mov r8=EINVAL -(p9) mov r8=ENOSYS - FSYS_RETURN - -#ifdef CONFIG_PARAVIRT - /* - * padd to make the size of this symbol constant - * independent of paravirtualization. - */ - .align PAGE_SIZE / 8 -#endif -END(__kernel_syscall_via_epc) diff --git a/trunk/arch/ia64/kernel/gate.lds.S b/trunk/arch/ia64/kernel/gate.lds.S index 88c64ed47c36..3cb1abc00e24 100644 --- a/trunk/arch/ia64/kernel/gate.lds.S +++ b/trunk/arch/ia64/kernel/gate.lds.S @@ -7,7 +7,6 @@ #include -#include "paravirt_patchlist.h" SECTIONS { @@ -34,21 +33,21 @@ SECTIONS . 
= GATE_ADDR + 0x600; .data.patch : { - __paravirt_start_gate_mckinley_e9_patchlist = .; + __start_gate_mckinley_e9_patchlist = .; *(.data.patch.mckinley_e9) - __paravirt_end_gate_mckinley_e9_patchlist = .; + __end_gate_mckinley_e9_patchlist = .; - __paravirt_start_gate_vtop_patchlist = .; + __start_gate_vtop_patchlist = .; *(.data.patch.vtop) - __paravirt_end_gate_vtop_patchlist = .; + __end_gate_vtop_patchlist = .; - __paravirt_start_gate_fsyscall_patchlist = .; + __start_gate_fsyscall_patchlist = .; *(.data.patch.fsyscall_table) - __paravirt_end_gate_fsyscall_patchlist = .; + __end_gate_fsyscall_patchlist = .; - __paravirt_start_gate_brl_fsys_bubble_down_patchlist = .; + __start_gate_brl_fsys_bubble_down_patchlist = .; *(.data.patch.brl_fsys_bubble_down) - __paravirt_end_gate_brl_fsys_bubble_down_patchlist = .; + __end_gate_brl_fsys_bubble_down_patchlist = .; } :readable .IA_64.unwind_info : { *(.IA_64.unwind_info*) } diff --git a/trunk/arch/ia64/kernel/head.S b/trunk/arch/ia64/kernel/head.S index 23f846de62d5..59301c472800 100644 --- a/trunk/arch/ia64/kernel/head.S +++ b/trunk/arch/ia64/kernel/head.S @@ -1050,7 +1050,7 @@ END(ia64_delay_loop) * except that the multiplication and the shift are done with 128-bit * intermediate precision so that we can produce a full 64-bit result. */ -GLOBAL_ENTRY(ia64_native_sched_clock) +GLOBAL_ENTRY(sched_clock) addl r8=THIS_CPU(cpu_info) + IA64_CPUINFO_NSEC_PER_CYC_OFFSET,r0 mov.m r9=ar.itc // fetch cycle-counter (35 cyc) ;; @@ -1066,13 +1066,7 @@ GLOBAL_ENTRY(ia64_native_sched_clock) ;; shrp r8=r9,r8,IA64_NSEC_PER_CYC_SHIFT br.ret.sptk.many rp -END(ia64_native_sched_clock) -#ifndef CONFIG_PARAVIRT - //unsigned long long - //sched_clock(void) __attribute__((alias("ia64_native_sched_clock"))); - .global sched_clock -sched_clock = ia64_native_sched_clock -#endif +END(sched_clock) #ifdef CONFIG_VIRT_CPU_ACCOUNTING GLOBAL_ENTRY(cycle_to_cputime) diff --git a/trunk/arch/ia64/kernel/ivt.S b/trunk/arch/ia64/kernel/ivt.S index ec9a5fdfa1b9..f675d8e33853 100644 --- a/trunk/arch/ia64/kernel/ivt.S +++ b/trunk/arch/ia64/kernel/ivt.S @@ -804,7 +804,7 @@ ENTRY(break_fault) /////////////////////////////////////////////////////////////////////// st1 [r16]=r0 // M2|3 clear current->thread.on_ustack flag #ifdef CONFIG_VIRT_CPU_ACCOUNTING - MOV_FROM_ITC(p0, p14, r30, r18) // M get cycle for accounting + mov.m r30=ar.itc // M get cycle for accounting #else mov b6=r30 // I0 setup syscall handler branch reg early #endif diff --git a/trunk/arch/ia64/kernel/mca.c b/trunk/arch/ia64/kernel/mca.c index 8f33a8840422..bab1de2d2f6a 100644 --- a/trunk/arch/ia64/kernel/mca.c +++ b/trunk/arch/ia64/kernel/mca.c @@ -1456,9 +1456,9 @@ ia64_mca_cmc_int_caller(int cmc_irq, void *arg) ia64_mca_cmc_int_handler(cmc_irq, arg); - cpuid = cpumask_next(cpuid+1, cpu_online_mask); + for (++cpuid ; cpuid < NR_CPUS && !cpu_online(cpuid) ; cpuid++); - if (cpuid < nr_cpu_ids) { + if (cpuid < NR_CPUS) { platform_send_ipi(cpuid, IA64_CMCP_VECTOR, IA64_IPI_DM_INT, 0); } else { /* If no log record, switch out of polling mode */ @@ -1525,7 +1525,7 @@ ia64_mca_cpe_int_caller(int cpe_irq, void *arg) ia64_mca_cpe_int_handler(cpe_irq, arg); - cpuid = cpumask_next(cpuid+1, cpu_online_mask); + for (++cpuid ; cpuid < NR_CPUS && !cpu_online(cpuid) ; cpuid++); if (cpuid < NR_CPUS) { platform_send_ipi(cpuid, IA64_CPEP_VECTOR, IA64_IPI_DM_INT, 0); diff --git a/trunk/arch/ia64/kernel/module.c b/trunk/arch/ia64/kernel/module.c index da3b0cf495a3..aaa7d901521f 100644 --- a/trunk/arch/ia64/kernel/module.c +++ 
b/trunk/arch/ia64/kernel/module.c @@ -446,14 +446,6 @@ module_frob_arch_sections (Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, char *secstrings, mod->arch.opd = s; else if (strcmp(".IA_64.unwind", secstrings + s->sh_name) == 0) mod->arch.unwind = s; -#ifdef CONFIG_PARAVIRT - else if (strcmp(".paravirt_bundles", - secstrings + s->sh_name) == 0) - mod->arch.paravirt_bundles = s; - else if (strcmp(".paravirt_insts", - secstrings + s->sh_name) == 0) - mod->arch.paravirt_insts = s; -#endif if (!mod->arch.core_plt || !mod->arch.init_plt || !mod->arch.got || !mod->arch.opd) { printk(KERN_ERR "%s: sections missing\n", mod->name); @@ -533,7 +525,8 @@ get_ltoff (struct module *mod, uint64_t value, int *okp) goto found; /* Not enough GOT entries? */ - BUG_ON(e >= (struct got_entry *) (mod->arch.got->sh_addr + mod->arch.got->sh_size)); + if (e >= (struct got_entry *) (mod->arch.got->sh_addr + mod->arch.got->sh_size)) + BUG(); e->val = value; ++mod->arch.next_got_entry; @@ -928,30 +921,6 @@ module_finalize (const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *mo DEBUGP("%s: init: entry=%p\n", __func__, mod->init); if (mod->arch.unwind) register_unwind_table(mod); -#ifdef CONFIG_PARAVIRT - if (mod->arch.paravirt_bundles) { - struct paravirt_patch_site_bundle *start = - (struct paravirt_patch_site_bundle *) - mod->arch.paravirt_bundles->sh_addr; - struct paravirt_patch_site_bundle *end = - (struct paravirt_patch_site_bundle *) - (mod->arch.paravirt_bundles->sh_addr + - mod->arch.paravirt_bundles->sh_size); - - paravirt_patch_apply_bundle(start, end); - } - if (mod->arch.paravirt_insts) { - struct paravirt_patch_site_inst *start = - (struct paravirt_patch_site_inst *) - mod->arch.paravirt_insts->sh_addr; - struct paravirt_patch_site_inst *end = - (struct paravirt_patch_site_inst *) - (mod->arch.paravirt_insts->sh_addr + - mod->arch.paravirt_insts->sh_size); - - paravirt_patch_apply_inst(start, end); - } -#endif return 0; } diff --git a/trunk/arch/ia64/kernel/paravirt.c b/trunk/arch/ia64/kernel/paravirt.c index a21d7bb9c69c..9f14c16f6369 100644 --- a/trunk/arch/ia64/kernel/paravirt.c +++ b/trunk/arch/ia64/kernel/paravirt.c @@ -46,23 +46,13 @@ struct pv_info pv_info = { * initialization hooks. */ -static void __init -ia64_native_patch_branch(unsigned long tag, unsigned long type); - -struct pv_init_ops pv_init_ops = -{ -#ifdef ASM_SUPPORTED - .patch_bundle = ia64_native_patch_bundle, -#endif - .patch_branch = ia64_native_patch_branch, -}; +struct pv_init_ops pv_init_ops; /*************************************************************************** * pv_cpu_ops * intrinsics hooks. 
*/ -#ifndef ASM_SUPPORTED /* ia64_native_xxx are macros so that we have to make them real functions */ #define DEFINE_VOID_FUNC1(name) \ @@ -70,14 +60,7 @@ struct pv_init_ops pv_init_ops = ia64_native_ ## name ## _func(unsigned long arg) \ { \ ia64_native_ ## name(arg); \ - } - -#define DEFINE_VOID_FUNC1_VOID(name) \ - static void \ - ia64_native_ ## name ## _func(void *arg) \ - { \ - ia64_native_ ## name(arg); \ - } + } \ #define DEFINE_VOID_FUNC2(name) \ static void \ @@ -85,7 +68,7 @@ struct pv_init_ops pv_init_ops = unsigned long arg1) \ { \ ia64_native_ ## name(arg0, arg1); \ - } + } \ #define DEFINE_FUNC0(name) \ static unsigned long \ @@ -101,7 +84,7 @@ struct pv_init_ops pv_init_ops = return ia64_native_ ## name(arg); \ } \ -DEFINE_VOID_FUNC1_VOID(fc); +DEFINE_VOID_FUNC1(fc); DEFINE_VOID_FUNC1(intrin_local_irq_restore); DEFINE_VOID_FUNC2(ptcga); @@ -291,266 +274,6 @@ ia64_native_setreg_func(int regnum, unsigned long val) break; } } -#else - -#define __DEFINE_FUNC(name, code) \ - extern const char ia64_native_ ## name ## _direct_start[]; \ - extern const char ia64_native_ ## name ## _direct_end[]; \ - asm (".align 32\n" \ - ".proc ia64_native_" #name "_func\n" \ - "ia64_native_" #name "_func:\n" \ - "ia64_native_" #name "_direct_start:\n" \ - code \ - "ia64_native_" #name "_direct_end:\n" \ - "br.cond.sptk.many b6\n" \ - ".endp ia64_native_" #name "_func\n") - -#define DEFINE_VOID_FUNC0(name, code) \ - extern void \ - ia64_native_ ## name ## _func(void); \ - __DEFINE_FUNC(name, code) - -#define DEFINE_VOID_FUNC1(name, code) \ - extern void \ - ia64_native_ ## name ## _func(unsigned long arg); \ - __DEFINE_FUNC(name, code) - -#define DEFINE_VOID_FUNC1_VOID(name, code) \ - extern void \ - ia64_native_ ## name ## _func(void *arg); \ - __DEFINE_FUNC(name, code) - -#define DEFINE_VOID_FUNC2(name, code) \ - extern void \ - ia64_native_ ## name ## _func(unsigned long arg0, \ - unsigned long arg1); \ - __DEFINE_FUNC(name, code) - -#define DEFINE_FUNC0(name, code) \ - extern unsigned long \ - ia64_native_ ## name ## _func(void); \ - __DEFINE_FUNC(name, code) - -#define DEFINE_FUNC1(name, type, code) \ - extern unsigned long \ - ia64_native_ ## name ## _func(type arg); \ - __DEFINE_FUNC(name, code) - -DEFINE_VOID_FUNC1_VOID(fc, - "fc r8\n"); -DEFINE_VOID_FUNC1(intrin_local_irq_restore, - ";;\n" - " cmp.ne p6, p7 = r8, r0\n" - ";;\n" - "(p6) ssm psr.i\n" - "(p7) rsm psr.i\n" - ";;\n" - "(p6) srlz.d\n"); - -DEFINE_VOID_FUNC2(ptcga, - "ptc.ga r8, r9\n"); -DEFINE_VOID_FUNC2(set_rr, - "mov rr[r8] = r9\n"); - -/* ia64_native_getreg(_IA64_REG_PSR) & IA64_PSR_I */ -DEFINE_FUNC0(get_psr_i, - "mov r2 = " __stringify(1 << IA64_PSR_I_BIT) "\n" - "mov r8 = psr\n" - ";;\n" - "and r8 = r2, r8\n"); - -DEFINE_FUNC1(thash, unsigned long, - "thash r8 = r8\n"); -DEFINE_FUNC1(get_cpuid, int, - "mov r8 = cpuid[r8]\n"); -DEFINE_FUNC1(get_pmd, int, - "mov r8 = pmd[r8]\n"); -DEFINE_FUNC1(get_rr, unsigned long, - "mov r8 = rr[r8]\n"); - -DEFINE_VOID_FUNC0(ssm_i, - "ssm psr.i\n"); -DEFINE_VOID_FUNC0(rsm_i, - "rsm psr.i\n"); - -extern void -ia64_native_set_rr0_to_rr4_func(unsigned long val0, unsigned long val1, - unsigned long val2, unsigned long val3, - unsigned long val4); -__DEFINE_FUNC(set_rr0_to_rr4, - "mov rr[r0] = r8\n" - "movl r2 = 0x2000000000000000\n" - ";;\n" - "mov rr[r2] = r9\n" - "shl r3 = r2, 1\n" /* movl r3 = 0x4000000000000000 */ - ";;\n" - "add r2 = r2, r3\n" /* movl r2 = 0x6000000000000000 */ - "mov rr[r3] = r10\n" - ";;\n" - "mov rr[r2] = r11\n" - "shl r3 = r3, 1\n" /* movl r3 = 0x8000000000000000 */ 
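
The !ASM_SUPPORTED wrappers deleted in this hunk exist because the ia64_native_*() intrinsics are macros, so real functions must be materialized before their addresses can be stored in pv_cpu_ops. A minimal C analogue of the pattern, with the intrinsic body as a stand-in rather than the real ptc.ga inline asm:

	/* Stand-in for the ptc.ga purge intrinsic, which is inline asm. */
	#define ia64_native_ptcga(a, b)	((void)(a), (void)(b))

	static void ia64_native_ptcga_func(unsigned long arg0,
					   unsigned long arg1)
	{
		ia64_native_ptcga(arg0, arg1);	/* expand the macro in a real body */
	}

	/* The ops table can then carry an ordinary function pointer: */
	struct pv_cpu_ops_sketch {
		void (*ptcga)(unsigned long, unsigned long);
	} ops_sketch = { .ptcga = ia64_native_ptcga_func };
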
- ";;\n" - "mov rr[r3] = r14\n"); - -extern unsigned long ia64_native_getreg_func(int regnum); -asm(".global ia64_native_getreg_func\n"); -#define __DEFINE_GET_REG(id, reg) \ - "mov r2 = " __stringify(_IA64_REG_ ## id) "\n" \ - ";;\n" \ - "cmp.eq p6, p0 = r2, r8\n" \ - ";;\n" \ - "(p6) mov r8 = " #reg "\n" \ - "(p6) br.cond.sptk.many b6\n" \ - ";;\n" -#define __DEFINE_GET_AR(id, reg) __DEFINE_GET_REG(AR_ ## id, ar.reg) -#define __DEFINE_GET_CR(id, reg) __DEFINE_GET_REG(CR_ ## id, cr.reg) - -__DEFINE_FUNC(getreg, - __DEFINE_GET_REG(GP, gp) - /*__DEFINE_GET_REG(IP, ip)*/ /* returned ip value shouldn't be constant */ - __DEFINE_GET_REG(PSR, psr) - __DEFINE_GET_REG(TP, tp) - __DEFINE_GET_REG(SP, sp) - - __DEFINE_GET_REG(AR_KR0, ar0) - __DEFINE_GET_REG(AR_KR1, ar1) - __DEFINE_GET_REG(AR_KR2, ar2) - __DEFINE_GET_REG(AR_KR3, ar3) - __DEFINE_GET_REG(AR_KR4, ar4) - __DEFINE_GET_REG(AR_KR5, ar5) - __DEFINE_GET_REG(AR_KR6, ar6) - __DEFINE_GET_REG(AR_KR7, ar7) - __DEFINE_GET_AR(RSC, rsc) - __DEFINE_GET_AR(BSP, bsp) - __DEFINE_GET_AR(BSPSTORE, bspstore) - __DEFINE_GET_AR(RNAT, rnat) - __DEFINE_GET_AR(FCR, fcr) - __DEFINE_GET_AR(EFLAG, eflag) - __DEFINE_GET_AR(CSD, csd) - __DEFINE_GET_AR(SSD, ssd) - __DEFINE_GET_REG(AR_CFLAG, ar27) - __DEFINE_GET_AR(FSR, fsr) - __DEFINE_GET_AR(FIR, fir) - __DEFINE_GET_AR(FDR, fdr) - __DEFINE_GET_AR(CCV, ccv) - __DEFINE_GET_AR(UNAT, unat) - __DEFINE_GET_AR(FPSR, fpsr) - __DEFINE_GET_AR(ITC, itc) - __DEFINE_GET_AR(PFS, pfs) - __DEFINE_GET_AR(LC, lc) - __DEFINE_GET_AR(EC, ec) - - __DEFINE_GET_CR(DCR, dcr) - __DEFINE_GET_CR(ITM, itm) - __DEFINE_GET_CR(IVA, iva) - __DEFINE_GET_CR(PTA, pta) - __DEFINE_GET_CR(IPSR, ipsr) - __DEFINE_GET_CR(ISR, isr) - __DEFINE_GET_CR(IIP, iip) - __DEFINE_GET_CR(IFA, ifa) - __DEFINE_GET_CR(ITIR, itir) - __DEFINE_GET_CR(IIPA, iipa) - __DEFINE_GET_CR(IFS, ifs) - __DEFINE_GET_CR(IIM, iim) - __DEFINE_GET_CR(IHA, iha) - __DEFINE_GET_CR(LID, lid) - __DEFINE_GET_CR(IVR, ivr) - __DEFINE_GET_CR(TPR, tpr) - __DEFINE_GET_CR(EOI, eoi) - __DEFINE_GET_CR(IRR0, irr0) - __DEFINE_GET_CR(IRR1, irr1) - __DEFINE_GET_CR(IRR2, irr2) - __DEFINE_GET_CR(IRR3, irr3) - __DEFINE_GET_CR(ITV, itv) - __DEFINE_GET_CR(PMV, pmv) - __DEFINE_GET_CR(CMCV, cmcv) - __DEFINE_GET_CR(LRR0, lrr0) - __DEFINE_GET_CR(LRR1, lrr1) - - "mov r8 = -1\n" /* unsupported case */ - ); - -extern void ia64_native_setreg_func(int regnum, unsigned long val); -asm(".global ia64_native_setreg_func\n"); -#define __DEFINE_SET_REG(id, reg) \ - "mov r2 = " __stringify(_IA64_REG_ ## id) "\n" \ - ";;\n" \ - "cmp.eq p6, p0 = r2, r9\n" \ - ";;\n" \ - "(p6) mov " #reg " = r8\n" \ - "(p6) br.cond.sptk.many b6\n" \ - ";;\n" -#define __DEFINE_SET_AR(id, reg) __DEFINE_SET_REG(AR_ ## id, ar.reg) -#define __DEFINE_SET_CR(id, reg) __DEFINE_SET_REG(CR_ ## id, cr.reg) -__DEFINE_FUNC(setreg, - "mov r2 = " __stringify(_IA64_REG_PSR_L) "\n" - ";;\n" - "cmp.eq p6, p0 = r2, r9\n" - ";;\n" - "(p6) mov psr.l = r8\n" -#ifdef HAVE_SERIALIZE_DIRECTIVE - ".serialize.data\n" -#endif - "(p6) br.cond.sptk.many b6\n" - __DEFINE_SET_REG(GP, gp) - __DEFINE_SET_REG(SP, sp) - - __DEFINE_SET_REG(AR_KR0, ar0) - __DEFINE_SET_REG(AR_KR1, ar1) - __DEFINE_SET_REG(AR_KR2, ar2) - __DEFINE_SET_REG(AR_KR3, ar3) - __DEFINE_SET_REG(AR_KR4, ar4) - __DEFINE_SET_REG(AR_KR5, ar5) - __DEFINE_SET_REG(AR_KR6, ar6) - __DEFINE_SET_REG(AR_KR7, ar7) - __DEFINE_SET_AR(RSC, rsc) - __DEFINE_SET_AR(BSP, bsp) - __DEFINE_SET_AR(BSPSTORE, bspstore) - __DEFINE_SET_AR(RNAT, rnat) - __DEFINE_SET_AR(FCR, fcr) - __DEFINE_SET_AR(EFLAG, eflag) - __DEFINE_SET_AR(CSD, csd) - 
__DEFINE_SET_AR(SSD, ssd) - __DEFINE_SET_REG(AR_CFLAG, ar27) - __DEFINE_SET_AR(FSR, fsr) - __DEFINE_SET_AR(FIR, fir) - __DEFINE_SET_AR(FDR, fdr) - __DEFINE_SET_AR(CCV, ccv) - __DEFINE_SET_AR(UNAT, unat) - __DEFINE_SET_AR(FPSR, fpsr) - __DEFINE_SET_AR(ITC, itc) - __DEFINE_SET_AR(PFS, pfs) - __DEFINE_SET_AR(LC, lc) - __DEFINE_SET_AR(EC, ec) - - __DEFINE_SET_CR(DCR, dcr) - __DEFINE_SET_CR(ITM, itm) - __DEFINE_SET_CR(IVA, iva) - __DEFINE_SET_CR(PTA, pta) - __DEFINE_SET_CR(IPSR, ipsr) - __DEFINE_SET_CR(ISR, isr) - __DEFINE_SET_CR(IIP, iip) - __DEFINE_SET_CR(IFA, ifa) - __DEFINE_SET_CR(ITIR, itir) - __DEFINE_SET_CR(IIPA, iipa) - __DEFINE_SET_CR(IFS, ifs) - __DEFINE_SET_CR(IIM, iim) - __DEFINE_SET_CR(IHA, iha) - __DEFINE_SET_CR(LID, lid) - __DEFINE_SET_CR(IVR, ivr) - __DEFINE_SET_CR(TPR, tpr) - __DEFINE_SET_CR(EOI, eoi) - __DEFINE_SET_CR(IRR0, irr0) - __DEFINE_SET_CR(IRR1, irr1) - __DEFINE_SET_CR(IRR2, irr2) - __DEFINE_SET_CR(IRR3, irr3) - __DEFINE_SET_CR(ITV, itv) - __DEFINE_SET_CR(PMV, pmv) - __DEFINE_SET_CR(CMCV, cmcv) - __DEFINE_SET_CR(LRR0, lrr0) - __DEFINE_SET_CR(LRR1, lrr1) - ); -#endif struct pv_cpu_ops pv_cpu_ops = { .fc = ia64_native_fc_func, @@ -643,258 +366,4 @@ ia64_native_do_steal_accounting(unsigned long *new_itm) struct pv_time_ops pv_time_ops = { .do_steal_accounting = ia64_native_do_steal_accounting, - .sched_clock = ia64_native_sched_clock, -}; - -/*************************************************************************** - * binary pacthing - * pv_init_ops.patch_bundle - */ - -#ifdef ASM_SUPPORTED -#define IA64_NATIVE_PATCH_DEFINE_GET_REG(name, reg) \ - __DEFINE_FUNC(get_ ## name, \ - ";;\n" \ - "mov r8 = " #reg "\n" \ - ";;\n") - -#define IA64_NATIVE_PATCH_DEFINE_SET_REG(name, reg) \ - __DEFINE_FUNC(set_ ## name, \ - ";;\n" \ - "mov " #reg " = r8\n" \ - ";;\n") - -#define IA64_NATIVE_PATCH_DEFINE_REG(name, reg) \ - IA64_NATIVE_PATCH_DEFINE_GET_REG(name, reg); \ - IA64_NATIVE_PATCH_DEFINE_SET_REG(name, reg) \ - -#define IA64_NATIVE_PATCH_DEFINE_AR(name, reg) \ - IA64_NATIVE_PATCH_DEFINE_REG(ar_ ## name, ar.reg) - -#define IA64_NATIVE_PATCH_DEFINE_CR(name, reg) \ - IA64_NATIVE_PATCH_DEFINE_REG(cr_ ## name, cr.reg) - - -IA64_NATIVE_PATCH_DEFINE_GET_REG(psr, psr); -IA64_NATIVE_PATCH_DEFINE_GET_REG(tp, tp); - -/* IA64_NATIVE_PATCH_DEFINE_SET_REG(psr_l, psr.l); */ -__DEFINE_FUNC(set_psr_l, - ";;\n" - "mov psr.l = r8\n" -#ifdef HAVE_SERIALIZE_DIRECTIVE - ".serialize.data\n" -#endif - ";;\n"); - -IA64_NATIVE_PATCH_DEFINE_REG(gp, gp); -IA64_NATIVE_PATCH_DEFINE_REG(sp, sp); - -IA64_NATIVE_PATCH_DEFINE_REG(kr0, ar0); -IA64_NATIVE_PATCH_DEFINE_REG(kr1, ar1); -IA64_NATIVE_PATCH_DEFINE_REG(kr2, ar2); -IA64_NATIVE_PATCH_DEFINE_REG(kr3, ar3); -IA64_NATIVE_PATCH_DEFINE_REG(kr4, ar4); -IA64_NATIVE_PATCH_DEFINE_REG(kr5, ar5); -IA64_NATIVE_PATCH_DEFINE_REG(kr6, ar6); -IA64_NATIVE_PATCH_DEFINE_REG(kr7, ar7); - -IA64_NATIVE_PATCH_DEFINE_AR(rsc, rsc); -IA64_NATIVE_PATCH_DEFINE_AR(bsp, bsp); -IA64_NATIVE_PATCH_DEFINE_AR(bspstore, bspstore); -IA64_NATIVE_PATCH_DEFINE_AR(rnat, rnat); -IA64_NATIVE_PATCH_DEFINE_AR(fcr, fcr); -IA64_NATIVE_PATCH_DEFINE_AR(eflag, eflag); -IA64_NATIVE_PATCH_DEFINE_AR(csd, csd); -IA64_NATIVE_PATCH_DEFINE_AR(ssd, ssd); -IA64_NATIVE_PATCH_DEFINE_REG(ar27, ar27); -IA64_NATIVE_PATCH_DEFINE_AR(fsr, fsr); -IA64_NATIVE_PATCH_DEFINE_AR(fir, fir); -IA64_NATIVE_PATCH_DEFINE_AR(fdr, fdr); -IA64_NATIVE_PATCH_DEFINE_AR(ccv, ccv); -IA64_NATIVE_PATCH_DEFINE_AR(unat, unat); -IA64_NATIVE_PATCH_DEFINE_AR(fpsr, fpsr); -IA64_NATIVE_PATCH_DEFINE_AR(itc, itc); -IA64_NATIVE_PATCH_DEFINE_AR(pfs, 
pfs); -IA64_NATIVE_PATCH_DEFINE_AR(lc, lc); -IA64_NATIVE_PATCH_DEFINE_AR(ec, ec); - -IA64_NATIVE_PATCH_DEFINE_CR(dcr, dcr); -IA64_NATIVE_PATCH_DEFINE_CR(itm, itm); -IA64_NATIVE_PATCH_DEFINE_CR(iva, iva); -IA64_NATIVE_PATCH_DEFINE_CR(pta, pta); -IA64_NATIVE_PATCH_DEFINE_CR(ipsr, ipsr); -IA64_NATIVE_PATCH_DEFINE_CR(isr, isr); -IA64_NATIVE_PATCH_DEFINE_CR(iip, iip); -IA64_NATIVE_PATCH_DEFINE_CR(ifa, ifa); -IA64_NATIVE_PATCH_DEFINE_CR(itir, itir); -IA64_NATIVE_PATCH_DEFINE_CR(iipa, iipa); -IA64_NATIVE_PATCH_DEFINE_CR(ifs, ifs); -IA64_NATIVE_PATCH_DEFINE_CR(iim, iim); -IA64_NATIVE_PATCH_DEFINE_CR(iha, iha); -IA64_NATIVE_PATCH_DEFINE_CR(lid, lid); -IA64_NATIVE_PATCH_DEFINE_CR(ivr, ivr); -IA64_NATIVE_PATCH_DEFINE_CR(tpr, tpr); -IA64_NATIVE_PATCH_DEFINE_CR(eoi, eoi); -IA64_NATIVE_PATCH_DEFINE_CR(irr0, irr0); -IA64_NATIVE_PATCH_DEFINE_CR(irr1, irr1); -IA64_NATIVE_PATCH_DEFINE_CR(irr2, irr2); -IA64_NATIVE_PATCH_DEFINE_CR(irr3, irr3); -IA64_NATIVE_PATCH_DEFINE_CR(itv, itv); -IA64_NATIVE_PATCH_DEFINE_CR(pmv, pmv); -IA64_NATIVE_PATCH_DEFINE_CR(cmcv, cmcv); -IA64_NATIVE_PATCH_DEFINE_CR(lrr0, lrr0); -IA64_NATIVE_PATCH_DEFINE_CR(lrr1, lrr1); - -static const struct paravirt_patch_bundle_elem ia64_native_patch_bundle_elems[] -__initdata_or_module = -{ -#define IA64_NATIVE_PATCH_BUNDLE_ELEM(name, type) \ - { \ - (void*)ia64_native_ ## name ## _direct_start, \ - (void*)ia64_native_ ## name ## _direct_end, \ - PARAVIRT_PATCH_TYPE_ ## type, \ - } - - IA64_NATIVE_PATCH_BUNDLE_ELEM(fc, FC), - IA64_NATIVE_PATCH_BUNDLE_ELEM(thash, THASH), - IA64_NATIVE_PATCH_BUNDLE_ELEM(get_cpuid, GET_CPUID), - IA64_NATIVE_PATCH_BUNDLE_ELEM(get_pmd, GET_PMD), - IA64_NATIVE_PATCH_BUNDLE_ELEM(ptcga, PTCGA), - IA64_NATIVE_PATCH_BUNDLE_ELEM(get_rr, GET_RR), - IA64_NATIVE_PATCH_BUNDLE_ELEM(set_rr, SET_RR), - IA64_NATIVE_PATCH_BUNDLE_ELEM(set_rr0_to_rr4, SET_RR0_TO_RR4), - IA64_NATIVE_PATCH_BUNDLE_ELEM(ssm_i, SSM_I), - IA64_NATIVE_PATCH_BUNDLE_ELEM(rsm_i, RSM_I), - IA64_NATIVE_PATCH_BUNDLE_ELEM(get_psr_i, GET_PSR_I), - IA64_NATIVE_PATCH_BUNDLE_ELEM(intrin_local_irq_restore, - INTRIN_LOCAL_IRQ_RESTORE), - -#define IA64_NATIVE_PATCH_BUNDLE_ELEM_GETREG(name, reg) \ - { \ - (void*)ia64_native_get_ ## name ## _direct_start, \ - (void*)ia64_native_get_ ## name ## _direct_end, \ - PARAVIRT_PATCH_TYPE_GETREG + _IA64_REG_ ## reg, \ - } - -#define IA64_NATIVE_PATCH_BUNDLE_ELEM_SETREG(name, reg) \ - { \ - (void*)ia64_native_set_ ## name ## _direct_start, \ - (void*)ia64_native_set_ ## name ## _direct_end, \ - PARAVIRT_PATCH_TYPE_SETREG + _IA64_REG_ ## reg, \ - } - -#define IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(name, reg) \ - IA64_NATIVE_PATCH_BUNDLE_ELEM_GETREG(name, reg), \ - IA64_NATIVE_PATCH_BUNDLE_ELEM_SETREG(name, reg) \ - -#define IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(name, reg) \ - IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(ar_ ## name, AR_ ## reg) - -#define IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(name, reg) \ - IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(cr_ ## name, CR_ ## reg) - - IA64_NATIVE_PATCH_BUNDLE_ELEM_GETREG(psr, PSR), - IA64_NATIVE_PATCH_BUNDLE_ELEM_GETREG(tp, TP), - - IA64_NATIVE_PATCH_BUNDLE_ELEM_SETREG(psr_l, PSR_L), - - IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(gp, GP), - IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(sp, SP), - - IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr0, AR_KR0), - IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr1, AR_KR1), - IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr2, AR_KR2), - IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr3, AR_KR3), - IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr4, AR_KR4), - IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr5, AR_KR5), - IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr6, AR_KR6), - 
IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr7, AR_KR7), - - IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(rsc, RSC), - IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(bsp, BSP), - IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(bspstore, BSPSTORE), - IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(rnat, RNAT), - IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(fcr, FCR), - IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(eflag, EFLAG), - IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(csd, CSD), - IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(ssd, SSD), - IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(ar27, AR_CFLAG), - IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(fsr, FSR), - IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(fir, FIR), - IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(fdr, FDR), - IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(ccv, CCV), - IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(unat, UNAT), - IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(fpsr, FPSR), - IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(itc, ITC), - IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(pfs, PFS), - IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(lc, LC), - IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(ec, EC), - - IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(dcr, DCR), - IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(itm, ITM), - IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(iva, IVA), - IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(pta, PTA), - IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(ipsr, IPSR), - IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(isr, ISR), - IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(iip, IIP), - IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(ifa, IFA), - IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(itir, ITIR), - IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(iipa, IIPA), - IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(ifs, IFS), - IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(iim, IIM), - IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(iha, IHA), - IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(lid, LID), - IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(ivr, IVR), - IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(tpr, TPR), - IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(eoi, EOI), - IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(irr0, IRR0), - IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(irr1, IRR1), - IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(irr2, IRR2), - IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(irr3, IRR3), - IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(itv, ITV), - IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(pmv, PMV), - IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(cmcv, CMCV), - IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(lrr0, LRR0), - IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(lrr1, LRR1), }; - -unsigned long __init_or_module -ia64_native_patch_bundle(void *sbundle, void *ebundle, unsigned long type) -{ - const unsigned long nelems = sizeof(ia64_native_patch_bundle_elems) / - sizeof(ia64_native_patch_bundle_elems[0]); - - return __paravirt_patch_apply_bundle(sbundle, ebundle, type, - ia64_native_patch_bundle_elems, - nelems, NULL); -} -#endif /* ASM_SUPPOTED */ - -extern const char ia64_native_switch_to[]; -extern const char ia64_native_leave_syscall[]; -extern const char ia64_native_work_processed_syscall[]; -extern const char ia64_native_leave_kernel[]; - -const struct paravirt_patch_branch_target ia64_native_branch_target[] -__initconst = { -#define PARAVIRT_BR_TARGET(name, type) \ - { \ - ia64_native_ ## name, \ - PARAVIRT_PATCH_TYPE_BR_ ## type, \ - } - PARAVIRT_BR_TARGET(switch_to, SWITCH_TO), - PARAVIRT_BR_TARGET(leave_syscall, LEAVE_SYSCALL), - PARAVIRT_BR_TARGET(work_processed_syscall, WORK_PROCESSED_SYSCALL), - PARAVIRT_BR_TARGET(leave_kernel, LEAVE_KERNEL), -}; - -static void __init -ia64_native_patch_branch(unsigned long tag, unsigned long type) -{ - const unsigned long nelem = - sizeof(ia64_native_branch_target) / - sizeof(ia64_native_branch_target[0]); - __paravirt_patch_apply_branch(tag, type, - ia64_native_branch_target, nelem); -} diff --git a/trunk/arch/ia64/kernel/paravirt_patch.c b/trunk/arch/ia64/kernel/paravirt_patch.c deleted 
file mode 100644 index bfdfef1b1ffd..000000000000 --- a/trunk/arch/ia64/kernel/paravirt_patch.c +++ /dev/null @@ -1,514 +0,0 @@ -/****************************************************************************** - * linux/arch/ia64/xen/paravirt_patch.c - * - * Copyright (c) 2008 Isaku Yamahata - * VA Linux Systems Japan K.K. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - */ - -#include -#include -#include -#include -#include - -typedef union ia64_inst { - struct { - unsigned long long qp : 6; - unsigned long long : 31; - unsigned long long opcode : 4; - unsigned long long reserved : 23; - } generic; - unsigned long long l; -} ia64_inst_t; - -/* - * flush_icache_range() can't be used here. - * we are here before cpu_init() which initializes - * ia64_i_cache_stride_shift. flush_icache_range() uses it. - */ -void __init_or_module -paravirt_flush_i_cache_range(const void *instr, unsigned long size) -{ - extern void paravirt_fc_i(const void *addr); - unsigned long i; - - for (i = 0; i < size; i += sizeof(bundle_t)) - paravirt_fc_i(instr + i); -} - -bundle_t* __init_or_module -paravirt_get_bundle(unsigned long tag) -{ - return (bundle_t *)(tag & ~3UL); -} - -unsigned long __init_or_module -paravirt_get_slot(unsigned long tag) -{ - return tag & 3UL; -} - -unsigned long __init_or_module -paravirt_get_num_inst(unsigned long stag, unsigned long etag) -{ - bundle_t *sbundle = paravirt_get_bundle(stag); - unsigned long sslot = paravirt_get_slot(stag); - bundle_t *ebundle = paravirt_get_bundle(etag); - unsigned long eslot = paravirt_get_slot(etag); - - return (ebundle - sbundle) * 3 + eslot - sslot + 1; -} - -unsigned long __init_or_module -paravirt_get_next_tag(unsigned long tag) -{ - unsigned long slot = paravirt_get_slot(tag); - - switch (slot) { - case 0: - case 1: - return tag + 1; - case 2: { - bundle_t *bundle = paravirt_get_bundle(tag); - return (unsigned long)(bundle + 1); - } - default: - BUG(); - } - /* NOTREACHED */ -} - -ia64_inst_t __init_or_module -paravirt_read_slot0(const bundle_t *bundle) -{ - ia64_inst_t inst; - inst.l = bundle->quad0.slot0; - return inst; -} - -ia64_inst_t __init_or_module -paravirt_read_slot1(const bundle_t *bundle) -{ - ia64_inst_t inst; - inst.l = bundle->quad0.slot1_p0 | - ((unsigned long long)bundle->quad1.slot1_p1 << 18UL); - return inst; -} - -ia64_inst_t __init_or_module -paravirt_read_slot2(const bundle_t *bundle) -{ - ia64_inst_t inst; - inst.l = bundle->quad1.slot2; - return inst; -} - -ia64_inst_t __init_or_module -paravirt_read_inst(unsigned long tag) -{ - bundle_t *bundle = paravirt_get_bundle(tag); - unsigned long slot = paravirt_get_slot(tag); - - switch (slot) { - case 0: - return paravirt_read_slot0(bundle); - case 1: - return paravirt_read_slot1(bundle); - case 2: - return paravirt_read_slot2(bundle); - default: - BUG(); - } - /* NOTREACHED */ -} - -void __init_or_module 
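
A hedged sketch of the tag arithmetic used by the paravirt_get_bundle()/paravirt_get_slot() helpers in the file being deleted here: a patch tag packs the address of a 16-byte-aligned instruction bundle with a slot number (0..2) in the low two bits, so both parts recover by masking. Assumes sizeof(bundle_t) == 16, as on IA-64:

	typedef unsigned long tag_t;

	static void *tag_to_bundle(tag_t tag)		{ return (void *)(tag & ~3UL); }
	static unsigned long tag_to_slot(tag_t tag)	{ return tag & 3UL; }

	/* Instructions from stag to etag inclusive: three slots per bundle. */
	static unsigned long tag_num_inst(tag_t stag, tag_t etag)
	{
		long bundles = ((char *)tag_to_bundle(etag) -
				(char *)tag_to_bundle(stag)) / 16;
		return bundles * 3 + tag_to_slot(etag) - tag_to_slot(stag) + 1;
	}

	/* Advance one slot; slot 2 wraps to slot 0 of the next bundle. */
	static tag_t tag_next(tag_t tag)
	{
		if (tag_to_slot(tag) == 2)
			return (tag_t)((char *)tag_to_bundle(tag) + 16);
		return tag + 1;
	}
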
-paravirt_write_slot0(bundle_t *bundle, ia64_inst_t inst) -{ - bundle->quad0.slot0 = inst.l; -} - -void __init_or_module -paravirt_write_slot1(bundle_t *bundle, ia64_inst_t inst) -{ - bundle->quad0.slot1_p0 = inst.l; - bundle->quad1.slot1_p1 = inst.l >> 18UL; -} - -void __init_or_module -paravirt_write_slot2(bundle_t *bundle, ia64_inst_t inst) -{ - bundle->quad1.slot2 = inst.l; -} - -void __init_or_module -paravirt_write_inst(unsigned long tag, ia64_inst_t inst) -{ - bundle_t *bundle = paravirt_get_bundle(tag); - unsigned long slot = paravirt_get_slot(tag); - - switch (slot) { - case 0: - paravirt_write_slot0(bundle, inst); - break; - case 1: - paravirt_write_slot1(bundle, inst); - break; - case 2: - paravirt_write_slot2(bundle, inst); - break; - default: - BUG(); - break; - } - paravirt_flush_i_cache_range(bundle, sizeof(*bundle)); -} - -/* for debug */ -void -paravirt_print_bundle(const bundle_t *bundle) -{ - const unsigned long *quad = (const unsigned long *)bundle; - ia64_inst_t slot0 = paravirt_read_slot0(bundle); - ia64_inst_t slot1 = paravirt_read_slot1(bundle); - ia64_inst_t slot2 = paravirt_read_slot2(bundle); - - printk(KERN_DEBUG - "bundle 0x%p 0x%016lx 0x%016lx\n", bundle, quad[0], quad[1]); - printk(KERN_DEBUG - "bundle template 0x%x\n", - bundle->quad0.template); - printk(KERN_DEBUG - "slot0 0x%lx slot1_p0 0x%lx slot1_p1 0x%lx slot2 0x%lx\n", - (unsigned long)bundle->quad0.slot0, - (unsigned long)bundle->quad0.slot1_p0, - (unsigned long)bundle->quad1.slot1_p1, - (unsigned long)bundle->quad1.slot2); - printk(KERN_DEBUG - "slot0 0x%016llx slot1 0x%016llx slot2 0x%016llx\n", - slot0.l, slot1.l, slot2.l); -} - -static int noreplace_paravirt __init_or_module = 0; - -static int __init setup_noreplace_paravirt(char *str) -{ - noreplace_paravirt = 1; - return 1; -} -__setup("noreplace-paravirt", setup_noreplace_paravirt); - -#ifdef ASM_SUPPORTED -static void __init_or_module -fill_nop_bundle(void *sbundle, void *ebundle) -{ - extern const char paravirt_nop_bundle[]; - extern const unsigned long paravirt_nop_bundle_size; - - void *bundle = sbundle; - - BUG_ON((((unsigned long)sbundle) % sizeof(bundle_t)) != 0); - BUG_ON((((unsigned long)ebundle) % sizeof(bundle_t)) != 0); - - while (bundle < ebundle) { - memcpy(bundle, paravirt_nop_bundle, paravirt_nop_bundle_size); - - bundle += paravirt_nop_bundle_size; - } -} - -/* helper function */ -unsigned long __init_or_module -__paravirt_patch_apply_bundle(void *sbundle, void *ebundle, unsigned long type, - const struct paravirt_patch_bundle_elem *elems, - unsigned long nelems, - const struct paravirt_patch_bundle_elem **found) -{ - unsigned long used = 0; - unsigned long i; - - BUG_ON((((unsigned long)sbundle) % sizeof(bundle_t)) != 0); - BUG_ON((((unsigned long)ebundle) % sizeof(bundle_t)) != 0); - - found = NULL; - for (i = 0; i < nelems; i++) { - const struct paravirt_patch_bundle_elem *p = &elems[i]; - if (p->type == type) { - unsigned long need = p->ebundle - p->sbundle; - unsigned long room = ebundle - sbundle; - - if (found != NULL) - *found = p; - - if (room < need) { - /* no room to replace. skip it */ - printk(KERN_DEBUG - "the space is too small to put " - "bundles. 
type %ld need %ld room %ld\n", - type, need, room); - break; - } - - used = need; - memcpy(sbundle, p->sbundle, used); - break; - } - } - - return used; -} - -void __init_or_module -paravirt_patch_apply_bundle(const struct paravirt_patch_site_bundle *start, - const struct paravirt_patch_site_bundle *end) -{ - const struct paravirt_patch_site_bundle *p; - - if (noreplace_paravirt) - return; - if (pv_init_ops.patch_bundle == NULL) - return; - - for (p = start; p < end; p++) { - unsigned long used; - - used = (*pv_init_ops.patch_bundle)(p->sbundle, p->ebundle, - p->type); - if (used == 0) - continue; - - fill_nop_bundle(p->sbundle + used, p->ebundle); - paravirt_flush_i_cache_range(p->sbundle, - p->ebundle - p->sbundle); - } - ia64_sync_i(); - ia64_srlz_i(); -} - -/* - * nop.i, nop.m, nop.f instruction are same format. - * but nop.b has differennt format. - * This doesn't support nop.b for now. - */ -static void __init_or_module -fill_nop_inst(unsigned long stag, unsigned long etag) -{ - extern const bundle_t paravirt_nop_mfi_inst_bundle[]; - unsigned long tag; - const ia64_inst_t nop_inst = - paravirt_read_slot0(paravirt_nop_mfi_inst_bundle); - - for (tag = stag; tag < etag; tag = paravirt_get_next_tag(tag)) - paravirt_write_inst(tag, nop_inst); -} - -void __init_or_module -paravirt_patch_apply_inst(const struct paravirt_patch_site_inst *start, - const struct paravirt_patch_site_inst *end) -{ - const struct paravirt_patch_site_inst *p; - - if (noreplace_paravirt) - return; - if (pv_init_ops.patch_inst == NULL) - return; - - for (p = start; p < end; p++) { - unsigned long tag; - bundle_t *sbundle; - bundle_t *ebundle; - - tag = (*pv_init_ops.patch_inst)(p->stag, p->etag, p->type); - if (tag == p->stag) - continue; - - fill_nop_inst(tag, p->etag); - sbundle = paravirt_get_bundle(p->stag); - ebundle = paravirt_get_bundle(p->etag) + 1; - paravirt_flush_i_cache_range(sbundle, (ebundle - sbundle) * - sizeof(bundle_t)); - } - ia64_sync_i(); - ia64_srlz_i(); -} -#endif /* ASM_SUPPOTED */ - -/* brl.cond.sptk.many X3 */ -typedef union inst_x3_op { - ia64_inst_t inst; - struct { - unsigned long qp: 6; - unsigned long btyp: 3; - unsigned long unused: 3; - unsigned long p: 1; - unsigned long imm20b: 20; - unsigned long wh: 2; - unsigned long d: 1; - unsigned long i: 1; - unsigned long opcode: 4; - }; - unsigned long l; -} inst_x3_op_t; - -typedef union inst_x3_imm { - ia64_inst_t inst; - struct { - unsigned long unused: 2; - unsigned long imm39: 39; - }; - unsigned long l; -} inst_x3_imm_t; - -void __init_or_module -paravirt_patch_reloc_brl(unsigned long tag, const void *target) -{ - unsigned long tag_op = paravirt_get_next_tag(tag); - unsigned long tag_imm = tag; - bundle_t *bundle = paravirt_get_bundle(tag); - - ia64_inst_t inst_op = paravirt_read_inst(tag_op); - ia64_inst_t inst_imm = paravirt_read_inst(tag_imm); - - inst_x3_op_t inst_x3_op = { .l = inst_op.l }; - inst_x3_imm_t inst_x3_imm = { .l = inst_imm.l }; - - unsigned long imm60 = - ((unsigned long)target - (unsigned long)bundle) >> 4; - - BUG_ON(paravirt_get_slot(tag) != 1); /* MLX */ - BUG_ON(((unsigned long)target & (sizeof(bundle_t) - 1)) != 0); - - /* imm60[59] 1bit */ - inst_x3_op.i = (imm60 >> 59) & 1; - /* imm60[19:0] 20bit */ - inst_x3_op.imm20b = imm60 & ((1UL << 20) - 1); - /* imm60[58:20] 39bit */ - inst_x3_imm.imm39 = (imm60 >> 20) & ((1UL << 39) - 1); - - inst_op.l = inst_x3_op.l; - inst_imm.l = inst_x3_imm.l; - - paravirt_write_inst(tag_op, inst_op); - paravirt_write_inst(tag_imm, inst_imm); -} - -/* br.cond.sptk.many B1 */ 
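
The relocation just above scatters a 60-bit, bundle-granular IP-relative displacement across the two slots of an MLX bundle. A sketch of that bit split, mirroring paravirt_patch_reloc_brl():

	/*
	 * imm60 is the byte displacement to the target, shifted right by 4
	 * (targets are bundle aligned).  The X3 format scatters it as:
	 *   i      = imm60[59]     (1 bit,   op slot)
	 *   imm20b = imm60[19:0]   (20 bits, op slot)
	 *   imm39  = imm60[58:20]  (39 bits, imm slot)
	 */
	static void brl_split_imm60(unsigned long imm60, unsigned long *i,
				    unsigned long *imm20b, unsigned long *imm39)
	{
		*i      = (imm60 >> 59) & 1;
		*imm20b = imm60 & ((1UL << 20) - 1);
		*imm39  = (imm60 >> 20) & ((1UL << 39) - 1);
	}
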
-typedef union inst_b1 { - ia64_inst_t inst; - struct { - unsigned long qp: 6; - unsigned long btype: 3; - unsigned long unused: 3; - unsigned long p: 1; - unsigned long imm20b: 20; - unsigned long wh: 2; - unsigned long d: 1; - unsigned long s: 1; - unsigned long opcode: 4; - }; - unsigned long l; -} inst_b1_t; - -void __init -paravirt_patch_reloc_br(unsigned long tag, const void *target) -{ - bundle_t *bundle = paravirt_get_bundle(tag); - ia64_inst_t inst = paravirt_read_inst(tag); - unsigned long target25 = (unsigned long)target - (unsigned long)bundle; - inst_b1_t inst_b1; - - BUG_ON(((unsigned long)target & (sizeof(bundle_t) - 1)) != 0); - - inst_b1.l = inst.l; - if (target25 & (1UL << 63)) - inst_b1.s = 1; - else - inst_b1.s = 0; - - inst_b1.imm20b = target25 >> 4; - inst.l = inst_b1.l; - - paravirt_write_inst(tag, inst); -} - -void __init -__paravirt_patch_apply_branch( - unsigned long tag, unsigned long type, - const struct paravirt_patch_branch_target *entries, - unsigned int nr_entries) -{ - unsigned int i; - for (i = 0; i < nr_entries; i++) { - if (entries[i].type == type) { - paravirt_patch_reloc_br(tag, entries[i].entry); - break; - } - } -} - -static void __init -paravirt_patch_apply_branch(const struct paravirt_patch_site_branch *start, - const struct paravirt_patch_site_branch *end) -{ - const struct paravirt_patch_site_branch *p; - - if (noreplace_paravirt) - return; - if (pv_init_ops.patch_branch == NULL) - return; - - for (p = start; p < end; p++) - (*pv_init_ops.patch_branch)(p->tag, p->type); - - ia64_sync_i(); - ia64_srlz_i(); -} - -void __init -paravirt_patch_apply(void) -{ - extern const char __start_paravirt_bundles[]; - extern const char __stop_paravirt_bundles[]; - extern const char __start_paravirt_insts[]; - extern const char __stop_paravirt_insts[]; - extern const char __start_paravirt_branches[]; - extern const char __stop_paravirt_branches[]; - - paravirt_patch_apply_bundle((const struct paravirt_patch_site_bundle *) - __start_paravirt_bundles, - (const struct paravirt_patch_site_bundle *) - __stop_paravirt_bundles); - paravirt_patch_apply_inst((const struct paravirt_patch_site_inst *) - __start_paravirt_insts, - (const struct paravirt_patch_site_inst *) - __stop_paravirt_insts); - paravirt_patch_apply_branch((const struct paravirt_patch_site_branch *) - __start_paravirt_branches, - (const struct paravirt_patch_site_branch *) - __stop_paravirt_branches); -} - -/* - * Local variables: - * mode: C - * c-set-style: "linux" - * c-basic-offset: 8 - * tab-width: 8 - * indent-tabs-mode: t - * End: - */ diff --git a/trunk/arch/ia64/kernel/paravirt_patchlist.c b/trunk/arch/ia64/kernel/paravirt_patchlist.c deleted file mode 100644 index b28082a95d45..000000000000 --- a/trunk/arch/ia64/kernel/paravirt_patchlist.c +++ /dev/null @@ -1,79 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2008 Isaku Yamahata - * VA Linux Systems Japan K.K. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - */ - -#include -#include - -#define DECLARE(name) \ - extern unsigned long \ - __ia64_native_start_gate_##name##_patchlist[]; \ - extern unsigned long \ - __ia64_native_end_gate_##name##_patchlist[] - -DECLARE(fsyscall); -DECLARE(brl_fsys_bubble_down); -DECLARE(vtop); -DECLARE(mckinley_e9); - -extern unsigned long __start_gate_section[]; - -#define ASSIGN(name) \ - .start_##name##_patchlist = \ - (unsigned long)__ia64_native_start_gate_##name##_patchlist, \ - .end_##name##_patchlist = \ - (unsigned long)__ia64_native_end_gate_##name##_patchlist - -struct pv_patchdata pv_patchdata __initdata = { - ASSIGN(fsyscall), - ASSIGN(brl_fsys_bubble_down), - ASSIGN(vtop), - ASSIGN(mckinley_e9), - - .gate_section = (void*)__start_gate_section, -}; - - -unsigned long __init -paravirt_get_gate_patchlist(enum pv_gate_patchlist type) -{ - -#define CASE(NAME, name) \ - case PV_GATE_START_##NAME: \ - return pv_patchdata.start_##name##_patchlist; \ - case PV_GATE_END_##NAME: \ - return pv_patchdata.end_##name##_patchlist; \ - - switch (type) { - CASE(FSYSCALL, fsyscall); - CASE(BRL_FSYS_BUBBLE_DOWN, brl_fsys_bubble_down); - CASE(VTOP, vtop); - CASE(MCKINLEY_E9, mckinley_e9); - default: - BUG(); - break; - } - return 0; -} - -void * __init -paravirt_get_gate_section(void) -{ - return pv_patchdata.gate_section; -} diff --git a/trunk/arch/ia64/kernel/paravirt_patchlist.h b/trunk/arch/ia64/kernel/paravirt_patchlist.h deleted file mode 100644 index 0684aa6c6507..000000000000 --- a/trunk/arch/ia64/kernel/paravirt_patchlist.h +++ /dev/null @@ -1,28 +0,0 @@ -/****************************************************************************** - * linux/arch/ia64/xen/paravirt_patchlist.h - * - * Copyright (c) 2008 Isaku Yamahata - * VA Linux Systems Japan K.K. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - */ - -#if defined(__IA64_GATE_PARAVIRTUALIZED_XEN) -#include -#else -#include -#endif - diff --git a/trunk/arch/ia64/kernel/paravirtentry.S b/trunk/arch/ia64/kernel/paravirtentry.S index 6158560d7f17..2f42fcb9776a 100644 --- a/trunk/arch/ia64/kernel/paravirtentry.S +++ b/trunk/arch/ia64/kernel/paravirtentry.S @@ -20,11 +20,8 @@ * */ -#include #include #include -#include -#include #include "entry.h" #define DATA8(sym, init_value) \ @@ -35,87 +32,29 @@ data8 init_value ; \ .popsection -#define BRANCH(targ, reg, breg, type) \ - PARAVIRT_PATCH_SITE_BR(PARAVIRT_PATCH_TYPE_BR_ ## type) ; \ - ;; \ - movl reg=targ ; \ - ;; \ - ld8 reg=[reg] ; \ - ;; \ - mov breg=reg ; \ +#define BRANCH(targ, reg, breg) \ + movl reg=targ ; \ + ;; \ + ld8 reg=[reg] ; \ + ;; \ + mov breg=reg ; \ br.cond.sptk.many breg -#define BRANCH_PROC(sym, reg, breg, type) \ - DATA8(paravirt_ ## sym ## _targ, ia64_native_ ## sym) ; \ - GLOBAL_ENTRY(paravirt_ ## sym) ; \ - BRANCH(paravirt_ ## sym ## _targ, reg, breg, type) ; \ +#define BRANCH_PROC(sym, reg, breg) \ + DATA8(paravirt_ ## sym ## _targ, ia64_native_ ## sym) ; \ + GLOBAL_ENTRY(paravirt_ ## sym) ; \ + BRANCH(paravirt_ ## sym ## _targ, reg, breg) ; \ END(paravirt_ ## sym) -#define BRANCH_PROC_UNWINFO(sym, reg, breg, type) \ - DATA8(paravirt_ ## sym ## _targ, ia64_native_ ## sym) ; \ - GLOBAL_ENTRY(paravirt_ ## sym) ; \ - PT_REGS_UNWIND_INFO(0) ; \ - BRANCH(paravirt_ ## sym ## _targ, reg, breg, type) ; \ +#define BRANCH_PROC_UNWINFO(sym, reg, breg) \ + DATA8(paravirt_ ## sym ## _targ, ia64_native_ ## sym) ; \ + GLOBAL_ENTRY(paravirt_ ## sym) ; \ + PT_REGS_UNWIND_INFO(0) ; \ + BRANCH(paravirt_ ## sym ## _targ, reg, breg) ; \ END(paravirt_ ## sym) -BRANCH_PROC(switch_to, r22, b7, SWITCH_TO) -BRANCH_PROC_UNWINFO(leave_syscall, r22, b7, LEAVE_SYSCALL) -BRANCH_PROC(work_processed_syscall, r2, b7, WORK_PROCESSED_SYSCALL) -BRANCH_PROC_UNWINFO(leave_kernel, r22, b7, LEAVE_KERNEL) - - -#ifdef CONFIG_MODULES -#define __INIT_OR_MODULE .text -#define __INITDATA_OR_MODULE .data -#else -#define __INIT_OR_MODULE __INIT -#define __INITDATA_OR_MODULE __INITDATA -#endif /* CONFIG_MODULES */ - - __INIT_OR_MODULE - GLOBAL_ENTRY(paravirt_fc_i) - fc.i r32 - br.ret.sptk.many rp - END(paravirt_fc_i) - __FINIT - - __INIT_OR_MODULE - .align 32 - GLOBAL_ENTRY(paravirt_nop_b_inst_bundle) - { - nop.b 0 - nop.b 0 - nop.b 0 - } - END(paravirt_nop_b_inst_bundle) - __FINIT - - /* NOTE: nop.[mfi] has same format */ - __INIT_OR_MODULE - GLOBAL_ENTRY(paravirt_nop_mfi_inst_bundle) - { - nop.m 0 - nop.f 0 - nop.i 0 - } - END(paravirt_nop_mfi_inst_bundle) - __FINIT - - __INIT_OR_MODULE - GLOBAL_ENTRY(paravirt_nop_bundle) -paravirt_nop_bundle_start: - { - nop 0 - nop 0 - nop 0 - } -paravirt_nop_bundle_end: - END(paravirt_nop_bundle) - __FINIT - - __INITDATA_OR_MODULE - .align 8 - .global paravirt_nop_bundle_size -paravirt_nop_bundle_size: - data8 paravirt_nop_bundle_end - paravirt_nop_bundle_start +BRANCH_PROC(switch_to, r22, b7) +BRANCH_PROC_UNWINFO(leave_syscall, r22, b7) +BRANCH_PROC(work_processed_syscall, r2, b7) +BRANCH_PROC_UNWINFO(leave_kernel, r22, b7) diff --git a/trunk/arch/ia64/kernel/patch.c b/trunk/arch/ia64/kernel/patch.c index 68a1311db806..b83b2c516008 100644 --- a/trunk/arch/ia64/kernel/patch.c +++ b/trunk/arch/ia64/kernel/patch.c @@ -7,7 +7,6 @@ #include #include -#include 
#include #include #include @@ -170,35 +169,16 @@ ia64_patch_mckinley_e9 (unsigned long start, unsigned long end) ia64_srlz_i(); } -extern unsigned long ia64_native_fsyscall_table[NR_syscalls]; -extern char ia64_native_fsys_bubble_down[]; -struct pv_fsys_data pv_fsys_data __initdata = { - .fsyscall_table = (unsigned long *)ia64_native_fsyscall_table, - .fsys_bubble_down = (void *)ia64_native_fsys_bubble_down, -}; - -unsigned long * __init -paravirt_get_fsyscall_table(void) -{ - return pv_fsys_data.fsyscall_table; -} - -char * __init -paravirt_get_fsys_bubble_down(void) -{ - return pv_fsys_data.fsys_bubble_down; -} - static void __init patch_fsyscall_table (unsigned long start, unsigned long end) { - u64 fsyscall_table = (u64)paravirt_get_fsyscall_table(); + extern unsigned long fsyscall_table[NR_syscalls]; s32 *offp = (s32 *) start; u64 ip; while (offp < (s32 *) end) { ip = (u64) ia64_imva((char *) offp + *offp); - ia64_patch_imm64(ip, fsyscall_table); + ia64_patch_imm64(ip, (u64) fsyscall_table); ia64_fc((void *) ip); ++offp; } @@ -209,7 +189,7 @@ patch_fsyscall_table (unsigned long start, unsigned long end) static void __init patch_brl_fsys_bubble_down (unsigned long start, unsigned long end) { - u64 fsys_bubble_down = (u64)paravirt_get_fsys_bubble_down(); + extern char fsys_bubble_down[]; s32 *offp = (s32 *) start; u64 ip; @@ -227,13 +207,13 @@ patch_brl_fsys_bubble_down (unsigned long start, unsigned long end) void __init ia64_patch_gate (void) { -# define START(name) paravirt_get_gate_patchlist(PV_GATE_START_##name) -# define END(name) paravirt_get_gate_patchlist(PV_GATE_END_##name) +# define START(name) ((unsigned long) __start_gate_##name##_patchlist) +# define END(name) ((unsigned long)__end_gate_##name##_patchlist) - patch_fsyscall_table(START(FSYSCALL), END(FSYSCALL)); - patch_brl_fsys_bubble_down(START(BRL_FSYS_BUBBLE_DOWN), END(BRL_FSYS_BUBBLE_DOWN)); - ia64_patch_vtop(START(VTOP), END(VTOP)); - ia64_patch_mckinley_e9(START(MCKINLEY_E9), END(MCKINLEY_E9)); + patch_fsyscall_table(START(fsyscall), END(fsyscall)); + patch_brl_fsys_bubble_down(START(brl_fsys_bubble_down), END(brl_fsys_bubble_down)); + ia64_patch_vtop(START(vtop), END(vtop)); + ia64_patch_mckinley_e9(START(mckinley_e9), END(mckinley_e9)); } void ia64_patch_phys_stack_reg(unsigned long val) @@ -249,7 +229,7 @@ void ia64_patch_phys_stack_reg(unsigned long val) while (offp < end) { ip = (u64) offp + *offp; ia64_patch(ip, mask, imm); - ia64_fc((void *)ip); + ia64_fc(ip); ++offp; } ia64_sync_i(); diff --git a/trunk/arch/ia64/kernel/perfmon.c b/trunk/arch/ia64/kernel/perfmon.c index 8a06dc480594..5c0f408cfd71 100644 --- a/trunk/arch/ia64/kernel/perfmon.c +++ b/trunk/arch/ia64/kernel/perfmon.c @@ -5603,7 +5603,7 @@ pfm_interrupt_handler(int irq, void *arg) * /proc/perfmon interface, for debug only */ -#define PFM_PROC_SHOW_HEADER ((void *)nr_cpu_ids+1) +#define PFM_PROC_SHOW_HEADER ((void *)NR_CPUS+1) static void * pfm_proc_start(struct seq_file *m, loff_t *pos) @@ -5612,7 +5612,7 @@ pfm_proc_start(struct seq_file *m, loff_t *pos) return PFM_PROC_SHOW_HEADER; } - while (*pos <= nr_cpu_ids) { + while (*pos <= NR_CPUS) { if (cpu_online(*pos - 1)) { return (void *)*pos; } diff --git a/trunk/arch/ia64/kernel/salinfo.c b/trunk/arch/ia64/kernel/salinfo.c index 7053c55b7649..ecb9eb78d687 100644 --- a/trunk/arch/ia64/kernel/salinfo.c +++ b/trunk/arch/ia64/kernel/salinfo.c @@ -317,7 +317,7 @@ salinfo_event_read(struct file *file, char __user *buffer, size_t count, loff_t } n = data->cpu_check; - for (i = 0; i < nr_cpu_ids; i++) { + 
for (i = 0; i < NR_CPUS; i++) { if (cpu_isset(n, data->cpu_event)) { if (!cpu_online(n)) { cpu_clear(n, data->cpu_event); @@ -326,7 +326,7 @@ salinfo_event_read(struct file *file, char __user *buffer, size_t count, loff_t cpu = n; break; } - if (++n == nr_cpu_ids) + if (++n == NR_CPUS) n = 0; } @@ -337,7 +337,7 @@ salinfo_event_read(struct file *file, char __user *buffer, size_t count, loff_t /* for next read, start checking at next CPU */ data->cpu_check = cpu; - if (++data->cpu_check == nr_cpu_ids) + if (++data->cpu_check == NR_CPUS) data->cpu_check = 0; snprintf(cmd, sizeof(cmd), "read %d\n", cpu); diff --git a/trunk/arch/ia64/kernel/setup.c b/trunk/arch/ia64/kernel/setup.c index 714066aeda7f..865af27c7737 100644 --- a/trunk/arch/ia64/kernel/setup.c +++ b/trunk/arch/ia64/kernel/setup.c @@ -52,7 +52,6 @@ #include #include #include -#include #include #include #include @@ -538,7 +537,6 @@ setup_arch (char **cmdline_p) paravirt_arch_setup_early(); ia64_patch_vtop((u64) __start___vtop_patchlist, (u64) __end___vtop_patchlist); - paravirt_patch_apply(); *cmdline_p = __va(ia64_boot_param->command_line); strlcpy(boot_command_line, *cmdline_p, COMMAND_LINE_SIZE); @@ -732,10 +730,10 @@ static void * c_start (struct seq_file *m, loff_t *pos) { #ifdef CONFIG_SMP - while (*pos < nr_cpu_ids && !cpu_online(*pos)) + while (*pos < NR_CPUS && !cpu_isset(*pos, cpu_online_map)) ++*pos; #endif - return *pos < nr_cpu_ids ? cpu_data(*pos) : NULL; + return *pos < NR_CPUS ? cpu_data(*pos) : NULL; } static void * @@ -1018,7 +1016,8 @@ cpu_init (void) | IA64_DCR_DA | IA64_DCR_DD | IA64_DCR_LC)); atomic_inc(&init_mm.mm_count); current->active_mm = &init_mm; - BUG_ON(current->mm); + if (current->mm) + BUG(); ia64_mmu_init(ia64_imva(cpu_data)); ia64_mca_cpu_init(ia64_imva(cpu_data)); diff --git a/trunk/arch/ia64/kernel/smp.c b/trunk/arch/ia64/kernel/smp.c index 2ea4199d9c57..da8f020d82c1 100644 --- a/trunk/arch/ia64/kernel/smp.c +++ b/trunk/arch/ia64/kernel/smp.c @@ -166,11 +166,11 @@ send_IPI_allbutself (int op) * Called with preemption disabled. 
*/ static inline void -send_IPI_mask(const struct cpumask *mask, int op) +send_IPI_mask(cpumask_t mask, int op) { unsigned int cpu; - for_each_cpu(cpu, mask) { + for_each_cpu_mask(cpu, mask) { send_IPI_single(cpu, op); } } @@ -316,7 +316,7 @@ void arch_send_call_function_single_ipi(int cpu) send_IPI_single(cpu, IPI_CALL_FUNC_SINGLE); } -void arch_send_call_function_ipi_mask(const struct cpumask *mask) +void arch_send_call_function_ipi(cpumask_t mask) { send_IPI_mask(mask, IPI_CALL_FUNC); } diff --git a/trunk/arch/ia64/kernel/smpboot.c b/trunk/arch/ia64/kernel/smpboot.c index 7700e23034bb..52290547c85b 100644 --- a/trunk/arch/ia64/kernel/smpboot.c +++ b/trunk/arch/ia64/kernel/smpboot.c @@ -581,14 +581,14 @@ smp_build_cpu_map (void) ia64_cpu_to_sapicid[0] = boot_cpu_id; cpus_clear(cpu_present_map); - set_cpu_present(0, true); - set_cpu_possible(0, true); + cpu_set(0, cpu_present_map); + cpu_set(0, cpu_possible_map); for (cpu = 1, i = 0; i < smp_boot_data.cpu_count; i++) { sapicid = smp_boot_data.cpu_phys_id[i]; if (sapicid == boot_cpu_id) continue; - set_cpu_present(cpu, true); - set_cpu_possible(cpu, true); + cpu_set(cpu, cpu_present_map); + cpu_set(cpu, cpu_possible_map); ia64_cpu_to_sapicid[cpu] = sapicid; cpu++; } @@ -626,9 +626,12 @@ smp_prepare_cpus (unsigned int max_cpus) */ if (!max_cpus) { printk(KERN_INFO "SMP mode deactivated.\n"); - init_cpu_online(cpumask_of(0)); - init_cpu_present(cpumask_of(0)); - init_cpu_possible(cpumask_of(0)); + cpus_clear(cpu_online_map); + cpus_clear(cpu_present_map); + cpus_clear(cpu_possible_map); + cpu_set(0, cpu_online_map); + cpu_set(0, cpu_present_map); + cpu_set(0, cpu_possible_map); return; } } diff --git a/trunk/arch/ia64/kernel/time.c b/trunk/arch/ia64/kernel/time.c index 641c8b61c4f1..d6747bae52d8 100644 --- a/trunk/arch/ia64/kernel/time.c +++ b/trunk/arch/ia64/kernel/time.c @@ -50,15 +50,6 @@ EXPORT_SYMBOL(last_cli_ip); #endif -#ifdef CONFIG_PARAVIRT -/* We need to define a real function for sched_clock, to override the - weak default version */ -unsigned long long sched_clock(void) -{ - return paravirt_sched_clock(); -} -#endif - #ifdef CONFIG_PARAVIRT static void paravirt_clocksource_resume(void) diff --git a/trunk/arch/ia64/kernel/vmlinux.lds.S b/trunk/arch/ia64/kernel/vmlinux.lds.S index 4a95e86b9ac2..3765efc5f963 100644 --- a/trunk/arch/ia64/kernel/vmlinux.lds.S +++ b/trunk/arch/ia64/kernel/vmlinux.lds.S @@ -169,30 +169,6 @@ SECTIONS __end___mckinley_e9_bundles = .; } -#if defined(CONFIG_PARAVIRT) - . = ALIGN(16); - .paravirt_bundles : AT(ADDR(.paravirt_bundles) - LOAD_OFFSET) - { - __start_paravirt_bundles = .; - *(.paravirt_bundles) - __stop_paravirt_bundles = .; - } - . = ALIGN(16); - .paravirt_insts : AT(ADDR(.paravirt_insts) - LOAD_OFFSET) - { - __start_paravirt_insts = .; - *(.paravirt_insts) - __stop_paravirt_insts = .; - } - . = ALIGN(16); - .paravirt_branches : AT(ADDR(.paravirt_branches) - LOAD_OFFSET) - { - __start_paravirt_branches = .; - *(.paravirt_branches) - __stop_paravirt_branches = .; - } -#endif - #if defined(CONFIG_IA64_GENERIC) /* Machine Vector */ . = ALIGN(16); @@ -225,12 +201,6 @@ SECTIONS __start_gate_section = .; *(.data.gate) __stop_gate_section = .; -#ifdef CONFIG_XEN - . = ALIGN(PAGE_SIZE); - __xen_start_gate_section = .; - *(.data.gate.xen) - __xen_stop_gate_section = .; -#endif } . 
= ALIGN(PAGE_SIZE); /* make sure the gate page doesn't expose * kernel data diff --git a/trunk/arch/ia64/kvm/kvm-ia64.c b/trunk/arch/ia64/kvm/kvm-ia64.c index 28af6a731bb8..076b00d1dbff 100644 --- a/trunk/arch/ia64/kvm/kvm-ia64.c +++ b/trunk/arch/ia64/kvm/kvm-ia64.c @@ -70,7 +70,7 @@ static void kvm_flush_icache(unsigned long start, unsigned long len) int l; for (l = 0; l < (len + 32); l += 32) - ia64_fc((void *)(start + l)); + ia64_fc(start + l); ia64_sync_i(); ia64_srlz_i(); diff --git a/trunk/arch/ia64/kvm/vcpu.c b/trunk/arch/ia64/kvm/vcpu.c index a18ee17b9192..d4d280505878 100644 --- a/trunk/arch/ia64/kvm/vcpu.c +++ b/trunk/arch/ia64/kvm/vcpu.c @@ -386,7 +386,7 @@ void set_rse_reg(struct kvm_pt_regs *regs, unsigned long r1, else *rnat_addr = (*rnat_addr) & (~nat_mask); - ia64_setreg(_IA64_REG_AR_BSPSTORE, (unsigned long)bspstore); + ia64_setreg(_IA64_REG_AR_BSPSTORE, bspstore); ia64_setreg(_IA64_REG_AR_RNAT, rnat); } local_irq_restore(psr); diff --git a/trunk/arch/ia64/kvm/vtlb.c b/trunk/arch/ia64/kvm/vtlb.c index 2c2501f13159..38232b37668b 100644 --- a/trunk/arch/ia64/kvm/vtlb.c +++ b/trunk/arch/ia64/kvm/vtlb.c @@ -210,7 +210,6 @@ void thash_vhpt_insert(struct kvm_vcpu *v, u64 pte, u64 itir, u64 va, int type) phy_pte &= ~PAGE_FLAGS_RV_MASK; psr = ia64_clear_ic(); ia64_itc(type, va, phy_pte, itir_ps(itir)); - paravirt_dv_serialize_data(); ia64_set_psr(psr); } @@ -457,7 +456,6 @@ void thash_purge_and_insert(struct kvm_vcpu *v, u64 pte, u64 itir, phy_pte &= ~PAGE_FLAGS_RV_MASK; psr = ia64_clear_ic(); ia64_itc(type, ifa, phy_pte, ps); - paravirt_dv_serialize_data(); ia64_set_psr(psr); } if (!(pte&VTLB_PTE_IO)) diff --git a/trunk/arch/ia64/mm/init.c b/trunk/arch/ia64/mm/init.c index c0f3bee69042..56e12903973c 100644 --- a/trunk/arch/ia64/mm/init.c +++ b/trunk/arch/ia64/mm/init.c @@ -35,7 +35,6 @@ #include #include #include -#include DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); @@ -260,7 +259,6 @@ put_kernel_page (struct page *page, unsigned long address, pgprot_t pgprot) static void __init setup_gate (void) { - void *gate_section; struct page *page; /* @@ -268,11 +266,10 @@ setup_gate (void) * headers etc. and once execute-only page to enable * privilege-promotion via "epc": */ - gate_section = paravirt_get_gate_section(); - page = virt_to_page(ia64_imva(gate_section)); + page = virt_to_page(ia64_imva(__start_gate_section)); put_kernel_page(page, GATE_ADDR, PAGE_READONLY); #ifdef HAVE_BUGGY_SEGREL - page = virt_to_page(ia64_imva(gate_section + PAGE_SIZE)); + page = virt_to_page(ia64_imva(__start_gate_section + PAGE_SIZE)); put_kernel_page(page, GATE_ADDR + PAGE_SIZE, PAGE_GATE); #else put_kernel_page(page, GATE_ADDR + PERCPU_PAGE_SIZE, PAGE_GATE); @@ -636,7 +633,8 @@ mem_init (void) #endif #ifdef CONFIG_FLATMEM - BUG_ON(!mem_map); + if (!mem_map) + BUG(); max_mapnr = max_low_pfn; #endif @@ -669,8 +667,8 @@ mem_init (void) * code can tell them apart. 
*/ for (i = 0; i < NR_syscalls; ++i) { + extern unsigned long fsyscall_table[NR_syscalls]; extern unsigned long sys_call_table[NR_syscalls]; - unsigned long *fsyscall_table = paravirt_get_fsyscall_table(); if (!fsyscall_table[i] || nolwsys) fsyscall_table[i] = sys_call_table[i] | 1; diff --git a/trunk/arch/ia64/mm/tlb.c b/trunk/arch/ia64/mm/tlb.c index b9f3d7bbb338..bd9818a36b47 100644 --- a/trunk/arch/ia64/mm/tlb.c +++ b/trunk/arch/ia64/mm/tlb.c @@ -309,7 +309,7 @@ flush_tlb_range (struct vm_area_struct *vma, unsigned long start, preempt_disable(); #ifdef CONFIG_SMP - if (mm != current->active_mm || cpumask_weight(mm_cpumask(mm)) != 1) { + if (mm != current->active_mm || cpus_weight(mm->cpu_vm_mask) != 1) { platform_global_tlb_purge(mm, start, end, nbits); preempt_enable(); return; diff --git a/trunk/arch/ia64/scripts/pvcheck.sed b/trunk/arch/ia64/scripts/pvcheck.sed index e59809a3fc01..ba66ac2e4c60 100644 --- a/trunk/arch/ia64/scripts/pvcheck.sed +++ b/trunk/arch/ia64/scripts/pvcheck.sed @@ -17,7 +17,6 @@ s/mov.*=.*cr\.iip/.warning \"cr.iip should not used directly\"/g s/mov.*=.*cr\.ivr/.warning \"cr.ivr should not used directly\"/g s/mov.*=[^\.]*psr/.warning \"psr should not used directly\"/g # avoid ar.fpsr s/mov.*=.*ar\.eflags/.warning \"ar.eflags should not used directly\"/g -s/mov.*=.*ar\.itc.*/.warning \"ar.itc should not used directly\"/g s/mov.*cr\.ifa.*=.*/.warning \"cr.ifa should not used directly\"/g s/mov.*cr\.itir.*=.*/.warning \"cr.itir should not used directly\"/g s/mov.*cr\.iha.*=.*/.warning \"cr.iha should not used directly\"/g diff --git a/trunk/arch/ia64/sn/kernel/io_common.c b/trunk/arch/ia64/sn/kernel/io_common.c index 57f280dd9def..0d4ffa4da1da 100644 --- a/trunk/arch/ia64/sn/kernel/io_common.c +++ b/trunk/arch/ia64/sn/kernel/io_common.c @@ -135,7 +135,8 @@ static s64 sn_device_fixup_war(u64 nasid, u64 widget, int device, } war_list = kzalloc(DEV_PER_WIDGET * sizeof(*war_list), GFP_KERNEL); - BUG_ON(!war_list); + if (!war_list) + BUG(); SAL_CALL_NOLOCK(isrv, SN_SAL_IOIF_GET_WIDGET_DMAFLUSH_LIST, nasid, widget, __pa(war_list), 0, 0, 0 ,0); @@ -179,20 +180,23 @@ sn_common_hubdev_init(struct hubdev_info *hubdev) sizeof(struct sn_flush_device_kernel *); hubdev->hdi_flush_nasid_list.widget_p = kzalloc(size, GFP_KERNEL); - BUG_ON(!hubdev->hdi_flush_nasid_list.widget_p); + if (!hubdev->hdi_flush_nasid_list.widget_p) + BUG(); for (widget = 0; widget <= HUB_WIDGET_ID_MAX; widget++) { size = DEV_PER_WIDGET * sizeof(struct sn_flush_device_kernel); sn_flush_device_kernel = kzalloc(size, GFP_KERNEL); - BUG_ON(!sn_flush_device_kernel); + if (!sn_flush_device_kernel) + BUG(); dev_entry = sn_flush_device_kernel; for (device = 0; device < DEV_PER_WIDGET; device++, dev_entry++) { size = sizeof(struct sn_flush_device_common); dev_entry->common = kzalloc(size, GFP_KERNEL); - BUG_ON(!dev_entry->common); + if (!dev_entry->common) + BUG(); if (sn_prom_feature_available(PRF_DEVICE_FLUSH_LIST)) status = sal_get_device_dmaflush_list( hubdev->hdi_nasid, widget, device, @@ -322,7 +326,8 @@ sn_common_bus_fixup(struct pci_bus *bus, */ controller->platform_data = kzalloc(sizeof(struct sn_platform_data), GFP_KERNEL); - BUG_ON(controller->platform_data == NULL); + if (controller->platform_data == NULL) + BUG(); sn_platform_data = (struct sn_platform_data *) controller->platform_data; sn_platform_data->provider_soft = provider_soft; diff --git a/trunk/arch/ia64/sn/kernel/io_init.c b/trunk/arch/ia64/sn/kernel/io_init.c index ee774c366a06..e2eb2da60f96 100644 --- a/trunk/arch/ia64/sn/kernel/io_init.c 
+++ b/trunk/arch/ia64/sn/kernel/io_init.c @@ -128,7 +128,8 @@ sn_legacy_pci_window_fixup(struct pci_controller *controller, { controller->window = kcalloc(2, sizeof(struct pci_window), GFP_KERNEL); - BUG_ON(controller->window == NULL); + if (controller->window == NULL) + BUG(); controller->window[0].offset = legacy_io; controller->window[0].resource.name = "legacy_io"; controller->window[0].resource.flags = IORESOURCE_IO; @@ -167,7 +168,8 @@ sn_pci_window_fixup(struct pci_dev *dev, unsigned int count, idx = controller->windows; new_count = controller->windows + count; new_window = kcalloc(new_count, sizeof(struct pci_window), GFP_KERNEL); - BUG_ON(new_window == NULL); + if (new_window == NULL) + BUG(); if (controller->window) { memcpy(new_window, controller->window, sizeof(struct pci_window) * controller->windows); @@ -220,7 +222,8 @@ sn_io_slot_fixup(struct pci_dev *dev) (u64) __pa(pcidev_info), (u64) __pa(sn_irq_info)); - BUG_ON(status); /* Cannot get platform pci device information */ + if (status) + BUG(); /* Cannot get platform pci device information */ /* Copy over PIO Mapped Addresses */ @@ -304,7 +307,8 @@ sn_pci_controller_fixup(int segment, int busnum, struct pci_bus *bus) prom_bussoft_ptr = __va(prom_bussoft_ptr); controller = kzalloc(sizeof(*controller), GFP_KERNEL); - BUG_ON(!controller); + if (!controller) + BUG(); controller->segment = segment; /* diff --git a/trunk/arch/ia64/sn/kernel/setup.c b/trunk/arch/ia64/sn/kernel/setup.c index e456f062f241..02c5b8a9fb60 100644 --- a/trunk/arch/ia64/sn/kernel/setup.c +++ b/trunk/arch/ia64/sn/kernel/setup.c @@ -732,7 +732,8 @@ void __init build_cnode_tables(void) kl_config_hdr_t *klgraph_header; nasid = cnodeid_to_nasid(node); klgraph_header = ia64_sn_get_klconfig_addr(nasid); - BUG_ON(klgraph_header == NULL); + if (klgraph_header == NULL) + BUG(); brd = NODE_OFFSET_TO_LBOARD(nasid, klgraph_header->ch_board_info); while (brd) { if (board_needs_cnode(brd->brd_type) && physical_node_map[brd->brd_nasid] < 0) { @@ -749,7 +750,7 @@ nasid_slice_to_cpuid(int nasid, int slice) { long cpu; - for (cpu = 0; cpu < nr_cpu_ids; cpu++) + for (cpu = 0; cpu < NR_CPUS; cpu++) if (cpuid_to_nasid(cpu) == nasid && cpuid_to_slice(cpu) == slice) return cpu; diff --git a/trunk/arch/ia64/sn/kernel/sn2/sn2_smp.c b/trunk/arch/ia64/sn/kernel/sn2/sn2_smp.c index 1176506b2bae..e585f9a2afb9 100644 --- a/trunk/arch/ia64/sn/kernel/sn2/sn2_smp.c +++ b/trunk/arch/ia64/sn/kernel/sn2/sn2_smp.c @@ -133,7 +133,7 @@ sn2_ipi_flush_all_tlb(struct mm_struct *mm) unsigned long itc; itc = ia64_get_itc(); - smp_flush_tlb_cpumask(*mm_cpumask(mm)); + smp_flush_tlb_cpumask(mm->cpu_vm_mask); itc = ia64_get_itc() - itc; __get_cpu_var(ptcstats).shub_ipi_flushes_itc_clocks += itc; __get_cpu_var(ptcstats).shub_ipi_flushes++; @@ -182,7 +182,7 @@ sn2_global_tlb_purge(struct mm_struct *mm, unsigned long start, nodes_clear(nodes_flushed); i = 0; - for_each_cpu(cpu, mm_cpumask(mm)) { + for_each_cpu_mask(cpu, mm->cpu_vm_mask) { cnode = cpu_to_node(cpu); node_set(cnode, nodes_flushed); lcpu = cpu; @@ -461,7 +461,7 @@ bool sn_cpu_disable_allowed(int cpu) static void *sn2_ptc_seq_start(struct seq_file *file, loff_t * offset) { - if (*offset < nr_cpu_ids) + if (*offset < NR_CPUS) return offset; return NULL; } @@ -469,7 +469,7 @@ static void *sn2_ptc_seq_start(struct seq_file *file, loff_t * offset) static void *sn2_ptc_seq_next(struct seq_file *file, void *data, loff_t * offset) { (*offset)++; - if (*offset < nr_cpu_ids) + if (*offset < NR_CPUS) return offset; return NULL; } @@ -491,7 +491,7 @@ 
static int sn2_ptc_seq_show(struct seq_file *file, void *data) seq_printf(file, "# ptctest %d, flushopt %d\n", sn2_ptctest, sn2_flush_opt); } - if (cpu < nr_cpu_ids && cpu_online(cpu)) { + if (cpu < NR_CPUS && cpu_online(cpu)) { stat = &per_cpu(ptcstats, cpu); seq_printf(file, "cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n", cpu, stat->ptc_l, stat->change_rid, stat->shub_ptc_flushes, stat->nodes_flushed, @@ -554,7 +554,7 @@ static int __init sn2_ptc_init(void) proc_sn2_ptc = proc_create(PTC_BASENAME, 0444, NULL, &proc_sn2_ptc_operations); - if (!proc_sn2_ptc) { + if (!&proc_sn2_ptc_operations) { printk(KERN_ERR "unable to create %s proc entry", PTC_BASENAME); return -EINVAL; } diff --git a/trunk/arch/ia64/sn/kernel/sn2/sn_hwperf.c b/trunk/arch/ia64/sn/kernel/sn2/sn_hwperf.c index 9e6491cf72bd..be339477f906 100644 --- a/trunk/arch/ia64/sn/kernel/sn2/sn_hwperf.c +++ b/trunk/arch/ia64/sn/kernel/sn2/sn_hwperf.c @@ -275,7 +275,8 @@ static int sn_hwperf_get_nearest_node_objdata(struct sn_hwperf_object_info *objb /* get it's interconnect topology */ sz = op->ports * sizeof(struct sn_hwperf_port_info); - BUG_ON(sz > sizeof(ptdata)); + if (sz > sizeof(ptdata)) + BUG(); e = ia64_sn_hwperf_op(sn_hwperf_master_nasid, SN_HWPERF_ENUM_PORTS, nodeobj->id, sz, (u64)&ptdata, 0, 0, NULL); @@ -309,7 +310,8 @@ static int sn_hwperf_get_nearest_node_objdata(struct sn_hwperf_object_info *objb if (router && (!found_cpu || !found_mem)) { /* search for a node connected to the same router */ sz = router->ports * sizeof(struct sn_hwperf_port_info); - BUG_ON(sz > sizeof(ptdata)); + if (sz > sizeof(ptdata)) + BUG(); e = ia64_sn_hwperf_op(sn_hwperf_master_nasid, SN_HWPERF_ENUM_PORTS, router->id, sz, (u64)&ptdata, 0, 0, NULL); @@ -610,7 +612,7 @@ static int sn_hwperf_op_cpu(struct sn_hwperf_op_info *op_info) op_info->a->arg &= SN_HWPERF_ARG_OBJID_MASK; if (cpu != SN_HWPERF_ARG_ANY_CPU) { - if (cpu >= nr_cpu_ids || !cpu_online(cpu)) { + if (cpu >= NR_CPUS || !cpu_online(cpu)) { r = -EINVAL; goto out; } diff --git a/trunk/arch/ia64/sn/pci/pcibr/pcibr_dma.c b/trunk/arch/ia64/sn/pci/pcibr/pcibr_dma.c index c659ad5613a0..060df4aa9916 100644 --- a/trunk/arch/ia64/sn/pci/pcibr/pcibr_dma.c +++ b/trunk/arch/ia64/sn/pci/pcibr/pcibr_dma.c @@ -256,7 +256,9 @@ void sn_dma_flush(u64 addr) hubinfo = (NODEPDA(nasid_to_cnodeid(nasid)))->pdinfo; - BUG_ON(!hubinfo); + if (!hubinfo) { + BUG(); + } flush_nasid_list = &hubinfo->hdi_flush_nasid_list; if (flush_nasid_list->widget_p == NULL) diff --git a/trunk/arch/ia64/xen/Makefile b/trunk/arch/ia64/xen/Makefile index e6f4a0a74228..0ad0224693d9 100644 --- a/trunk/arch/ia64/xen/Makefile +++ b/trunk/arch/ia64/xen/Makefile @@ -3,29 +3,14 @@ # obj-y := hypercall.o xenivt.o xensetup.o xen_pv_ops.o irq_xen.o \ - hypervisor.o xencomm.o xcom_hcall.o grant-table.o time.o suspend.o \ - gate-data.o + hypervisor.o xencomm.o xcom_hcall.o grant-table.o time.o suspend.o obj-$(CONFIG_IA64_GENERIC) += machvec.o -# The gate DSO image is built using a special linker script. -include $(srctree)/arch/ia64/kernel/Makefile.gate - -# tell compiled for xen -CPPFLAGS_gate.lds += -D__IA64_GATE_PARAVIRTUALIZED_XEN -AFLAGS_gate.o += -D__IA64_ASM_PARAVIRTUALIZED_XEN -D__IA64_GATE_PARAVIRTUALIZED_XEN - -# use same file of native. 
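One hunk above replaces the check of proc_create()'s return value with if (!&proc_sn2_ptc_operations). The address of a file-scope object can never be NULL, so that branch is dead code and a failed proc_create() goes undetected. A small model of the difference; fake_proc_create() and its types are invented stand-ins, not the kernel API:

	#include <stdio.h>
	#include <stddef.h>

	struct file_ops { int dummy; };

	static const struct file_ops proc_ops;	/* stands in for proc_sn2_ptc_operations */

	static const void *fake_proc_create(int fail)	/* stands in for proc_create() */
	{
		return fail ? NULL : (const void *)&proc_ops;
	}

	int main(void)
	{
		const void *entry = fake_proc_create(1);	/* simulate failure */

		if (!&proc_ops)		/* always false: &object is never NULL */
			puts("unreachable");
		if (!entry)		/* the check that actually detects failure */
			puts("proc entry creation failed");
		return 0;
	}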
-$(obj)/gate.o: $(src)/../kernel/gate.S FORCE - $(call if_changed_dep,as_o_S) -$(obj)/gate.lds: $(src)/../kernel/gate.lds.S FORCE - $(call if_changed_dep,cpp_lds_S) - - AFLAGS_xenivt.o += -D__IA64_ASM_PARAVIRTUALIZED_XEN # xen multi compile -ASM_PARAVIRT_MULTI_COMPILE_SRCS = ivt.S entry.S fsys.S +ASM_PARAVIRT_MULTI_COMPILE_SRCS = ivt.S entry.S ASM_PARAVIRT_OBJS = $(addprefix xen-,$(ASM_PARAVIRT_MULTI_COMPILE_SRCS:.S=.o)) obj-y += $(ASM_PARAVIRT_OBJS) define paravirtualized_xen diff --git a/trunk/arch/ia64/xen/gate-data.S b/trunk/arch/ia64/xen/gate-data.S deleted file mode 100644 index 7d4830afc91d..000000000000 --- a/trunk/arch/ia64/xen/gate-data.S +++ /dev/null @@ -1,3 +0,0 @@ - .section .data.gate.xen, "aw" - - .incbin "arch/ia64/xen/gate.so" diff --git a/trunk/arch/ia64/xen/hypercall.S b/trunk/arch/ia64/xen/hypercall.S index e32dae444dd6..45e02bb64a92 100644 --- a/trunk/arch/ia64/xen/hypercall.S +++ b/trunk/arch/ia64/xen/hypercall.S @@ -9,7 +9,6 @@ #include #include -#ifdef __INTEL_COMPILER /* * Hypercalls without parameter. */ @@ -73,7 +72,6 @@ GLOBAL_ENTRY(xen_set_rr0_to_rr4) br.ret.sptk.many rp ;; END(xen_set_rr0_to_rr4) -#endif GLOBAL_ENTRY(xen_send_ipi) mov r14=r32 diff --git a/trunk/arch/ia64/xen/time.c b/trunk/arch/ia64/xen/time.c index fb8332690179..68d6204c3f16 100644 --- a/trunk/arch/ia64/xen/time.c +++ b/trunk/arch/ia64/xen/time.c @@ -175,58 +175,10 @@ static void xen_itc_jitter_data_reset(void) } while (unlikely(ret != lcycle)); } -/* based on xen_sched_clock() in arch/x86/xen/time.c. */ -/* - * This relies on HAVE_UNSTABLE_SCHED_CLOCK. If it can't be defined, - * something similar logic should be implemented here. - */ -/* - * Xen sched_clock implementation. Returns the number of unstolen - * nanoseconds, which is nanoseconds the VCPU spent in RUNNING+BLOCKED - * states. - */ -static unsigned long long xen_sched_clock(void) -{ - struct vcpu_runstate_info runstate; - - unsigned long long now; - unsigned long long offset; - unsigned long long ret; - - /* - * Ideally sched_clock should be called on a per-cpu basis - * anyway, so preempt should already be disabled, but that's - * not current practice at the moment. - */ - preempt_disable(); - - /* - * both ia64_native_sched_clock() and xen's runstate are - * based on mAR.ITC. So difference of them makes sense. - */ - now = ia64_native_sched_clock(); - - get_runstate_snapshot(&runstate); - - WARN_ON(runstate.state != RUNSTATE_running); - - offset = 0; - if (now > runstate.state_entry_time) - offset = now - runstate.state_entry_time; - ret = runstate.time[RUNSTATE_blocked] + - runstate.time[RUNSTATE_running] + - offset; - - preempt_enable(); - - return ret; -} - struct pv_time_ops xen_time_ops __initdata = { .init_missing_ticks_accounting = xen_init_missing_ticks_accounting, .do_steal_accounting = xen_do_steal_accounting, .clocksource_resume = xen_itc_jitter_data_reset, - .sched_clock = xen_sched_clock, }; /* Called after suspend, to resume time. 
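The xen_sched_clock() removed above reports "unstolen" time: nanoseconds the VCPU spent in the RUNNING and BLOCKED states, plus whatever has accumulated in the current running period since state_entry_time. A compilable model of that arithmetic, with the runstate structure simplified from the real vcpu_runstate_info:

	#include <stdio.h>

	enum { RUNSTATE_running, RUNSTATE_runnable, RUNSTATE_blocked, RUNSTATE_offline };

	struct vcpu_runstate_info {
		int state;
		unsigned long long state_entry_time;
		unsigned long long time[4];
	};

	static unsigned long long sched_clock_unstolen(unsigned long long now,
						       const struct vcpu_runstate_info *rs)
	{
		unsigned long long offset = 0;

		/* credit the portion of the current RUNNING period so far */
		if (now > rs->state_entry_time)
			offset = now - rs->state_entry_time;

		return rs->time[RUNSTATE_blocked] + rs->time[RUNSTATE_running] + offset;
	}

	int main(void)
	{
		struct vcpu_runstate_info rs = {
			.state = RUNSTATE_running,
			.state_entry_time = 900,
			.time = { [RUNSTATE_running] = 700, [RUNSTATE_blocked] = 200 },
		};

		/* 700 + 200 + (1000 - 900): prints 1000 */
		printf("unstolen ns: %llu\n", sched_clock_unstolen(1000, &rs));
		return 0;
	}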
*/ diff --git a/trunk/arch/ia64/xen/xen_pv_ops.c b/trunk/arch/ia64/xen/xen_pv_ops.c index 5e2270a999fa..936cff3c96e0 100644 --- a/trunk/arch/ia64/xen/xen_pv_ops.c +++ b/trunk/arch/ia64/xen/xen_pv_ops.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include @@ -154,13 +153,6 @@ xen_post_smp_prepare_boot_cpu(void) xen_setup_vcpu_info_placement(); } -#ifdef ASM_SUPPORTED -static unsigned long __init_or_module -xen_patch_bundle(void *sbundle, void *ebundle, unsigned long type); -#endif -static void __init -xen_patch_branch(unsigned long tag, unsigned long type); - static const struct pv_init_ops xen_init_ops __initconst = { .banner = xen_banner, @@ -171,53 +163,6 @@ static const struct pv_init_ops xen_init_ops __initconst = { .arch_setup_nomca = xen_arch_setup_nomca, .post_smp_prepare_boot_cpu = xen_post_smp_prepare_boot_cpu, -#ifdef ASM_SUPPORTED - .patch_bundle = xen_patch_bundle, -#endif - .patch_branch = xen_patch_branch, -}; - -/*************************************************************************** - * pv_fsys_data - * addresses for fsys - */ - -extern unsigned long xen_fsyscall_table[NR_syscalls]; -extern char xen_fsys_bubble_down[]; -struct pv_fsys_data xen_fsys_data __initdata = { - .fsyscall_table = (unsigned long *)xen_fsyscall_table, - .fsys_bubble_down = (void *)xen_fsys_bubble_down, -}; - -/*************************************************************************** - * pv_patchdata - * patchdata addresses - */ - -#define DECLARE(name) \ - extern unsigned long __xen_start_gate_##name##_patchlist[]; \ - extern unsigned long __xen_end_gate_##name##_patchlist[] - -DECLARE(fsyscall); -DECLARE(brl_fsys_bubble_down); -DECLARE(vtop); -DECLARE(mckinley_e9); - -extern unsigned long __xen_start_gate_section[]; - -#define ASSIGN(name) \ - .start_##name##_patchlist = \ - (unsigned long)__xen_start_gate_##name##_patchlist, \ - .end_##name##_patchlist = \ - (unsigned long)__xen_end_gate_##name##_patchlist - -static struct pv_patchdata xen_patchdata __initdata = { - ASSIGN(fsyscall), - ASSIGN(brl_fsys_bubble_down), - ASSIGN(vtop), - ASSIGN(mckinley_e9), - - .gate_section = (void*)__xen_start_gate_section, }; /*************************************************************************** @@ -225,76 +170,6 @@ static struct pv_patchdata xen_patchdata __initdata = { * intrinsics hooks. */ -#ifndef ASM_SUPPORTED -static void -xen_set_itm_with_offset(unsigned long val) -{ - /* ia64_cpu_local_tick() calls this with interrupt enabled. */ - /* WARN_ON(!irqs_disabled()); */ - xen_set_itm(val - XEN_MAPPEDREGS->itc_offset); -} - -static unsigned long -xen_get_itm_with_offset(void) -{ - /* unused at this moment */ - printk(KERN_DEBUG "%s is called.\n", __func__); - - WARN_ON(!irqs_disabled()); - return ia64_native_getreg(_IA64_REG_CR_ITM) + - XEN_MAPPEDREGS->itc_offset; -} - -/* ia64_set_itc() is only called by - * cpu_init() with ia64_set_itc(0) and ia64_sync_itc(). - * So XEN_MAPPEDRESG->itc_offset cal be considered as almost constant. 
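The xen_set_itc()/xen_get_itc() pair deleted in the hunk that follows virtualizes ar.itc with a per-domain offset and keeps reads monotonic through a cmpxchg loop on itc_last. A userspace model of both, using C11 atomics in place of the ia64 cmpxchg (a portability assumption; the removed kernel code is visible below):

	#include <stdio.h>
	#include <stdatomic.h>

	static unsigned long host_itc;		/* stands in for ar.itc */
	static unsigned long itc_offset;	/* XEN_MAPPEDREGS->itc_offset */
	static _Atomic unsigned long itc_last;	/* XEN_MAPPEDREGS->itc_last */

	static void model_set_itc(unsigned long val)
	{
		itc_offset = val - host_itc;
		atomic_store(&itc_last, val);
	}

	static unsigned long model_get_itc(void)
	{
		unsigned long res, last;

		do {
			last = atomic_load(&itc_last);
			res = host_itc + itc_offset;
			if (last >= res)	/* never step backwards */
				res = last + 1;
		} while (!atomic_compare_exchange_weak(&itc_last, &last, res));
		return res;
	}

	int main(void)
	{
		host_itc = 500;
		model_set_itc(1000);
		printf("%lu\n", model_get_itc());	/* 1001: bumped past itc_last */
		host_itc += 10;
		printf("%lu\n", model_get_itc());	/* 1010 */
		return 0;
	}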
- */ -static void -xen_set_itc(unsigned long val) -{ - unsigned long mitc; - - WARN_ON(!irqs_disabled()); - mitc = ia64_native_getreg(_IA64_REG_AR_ITC); - XEN_MAPPEDREGS->itc_offset = val - mitc; - XEN_MAPPEDREGS->itc_last = val; -} - -static unsigned long -xen_get_itc(void) -{ - unsigned long res; - unsigned long itc_offset; - unsigned long itc_last; - unsigned long ret_itc_last; - - itc_offset = XEN_MAPPEDREGS->itc_offset; - do { - itc_last = XEN_MAPPEDREGS->itc_last; - res = ia64_native_getreg(_IA64_REG_AR_ITC); - res += itc_offset; - if (itc_last >= res) - res = itc_last + 1; - ret_itc_last = cmpxchg(&XEN_MAPPEDREGS->itc_last, - itc_last, res); - } while (unlikely(ret_itc_last != itc_last)); - return res; - -#if 0 - /* ia64_itc_udelay() calls ia64_get_itc() with interrupt enabled. - Should it be paravirtualized instead? */ - WARN_ON(!irqs_disabled()); - itc_offset = XEN_MAPPEDREGS->itc_offset; - itc_last = XEN_MAPPEDREGS->itc_last; - res = ia64_native_getreg(_IA64_REG_AR_ITC); - res += itc_offset; - if (itc_last >= res) - res = itc_last + 1; - XEN_MAPPEDREGS->itc_last = res; - return res; -#endif -} - static void xen_setreg(int regnum, unsigned long val) { switch (regnum) { @@ -306,14 +181,11 @@ static void xen_setreg(int regnum, unsigned long val) xen_set_eflag(val); break; #endif - case _IA64_REG_AR_ITC: - xen_set_itc(val); - break; case _IA64_REG_CR_TPR: xen_set_tpr(val); break; case _IA64_REG_CR_ITM: - xen_set_itm_with_offset(val); + xen_set_itm(val); break; case _IA64_REG_CR_EOI: xen_eoi(val); @@ -337,12 +209,6 @@ static unsigned long xen_getreg(int regnum) res = xen_get_eflag(); break; #endif - case _IA64_REG_AR_ITC: - res = xen_get_itc(); - break; - case _IA64_REG_CR_ITM: - res = xen_get_itm_with_offset(); - break; case _IA64_REG_CR_IVR: res = xen_get_ivr(); break; @@ -393,417 +259,8 @@ xen_intrin_local_irq_restore(unsigned long mask) else xen_rsm_i(); } -#else -#define __DEFINE_FUNC(name, code) \ - extern const char xen_ ## name ## _direct_start[]; \ - extern const char xen_ ## name ## _direct_end[]; \ - asm (".align 32\n" \ - ".proc xen_" #name "\n" \ - "xen_" #name ":\n" \ - "xen_" #name "_direct_start:\n" \ - code \ - "xen_" #name "_direct_end:\n" \ - "br.cond.sptk.many b6\n" \ - ".endp xen_" #name "\n") - -#define DEFINE_VOID_FUNC0(name, code) \ - extern void \ - xen_ ## name (void); \ - __DEFINE_FUNC(name, code) - -#define DEFINE_VOID_FUNC1(name, code) \ - extern void \ - xen_ ## name (unsigned long arg); \ - __DEFINE_FUNC(name, code) - -#define DEFINE_VOID_FUNC1_VOID(name, code) \ - extern void \ - xen_ ## name (void *arg); \ - __DEFINE_FUNC(name, code) - -#define DEFINE_VOID_FUNC2(name, code) \ - extern void \ - xen_ ## name (unsigned long arg0, \ - unsigned long arg1); \ - __DEFINE_FUNC(name, code) -#define DEFINE_FUNC0(name, code) \ - extern unsigned long \ - xen_ ## name (void); \ - __DEFINE_FUNC(name, code) - -#define DEFINE_FUNC1(name, type, code) \ - extern unsigned long \ - xen_ ## name (type arg); \ - __DEFINE_FUNC(name, code) - -#define XEN_PSR_I_ADDR_ADDR (XSI_BASE + XSI_PSR_I_ADDR_OFS) - -/* - * static void xen_set_itm_with_offset(unsigned long val) - * xen_set_itm(val - XEN_MAPPEDREGS->itc_offset); - */ -/* 2 bundles */ -DEFINE_VOID_FUNC1(set_itm_with_offset, - "mov r2 = " __stringify(XSI_BASE) " + " - __stringify(XSI_ITC_OFFSET_OFS) "\n" - ";;\n" - "ld8 r3 = [r2]\n" - ";;\n" - "sub r8 = r8, r3\n" - "break " __stringify(HYPERPRIVOP_SET_ITM) "\n"); - -/* - * static unsigned long xen_get_itm_with_offset(void) - * return ia64_native_getreg(_IA64_REG_CR_ITM) + 
XEN_MAPPEDREGS->itc_offset; - */ -/* 2 bundles */ -DEFINE_FUNC0(get_itm_with_offset, - "mov r2 = " __stringify(XSI_BASE) " + " - __stringify(XSI_ITC_OFFSET_OFS) "\n" - ";;\n" - "ld8 r3 = [r2]\n" - "mov r8 = cr.itm\n" - ";;\n" - "add r8 = r8, r2\n"); - -/* - * static void xen_set_itc(unsigned long val) - * unsigned long mitc; - * - * WARN_ON(!irqs_disabled()); - * mitc = ia64_native_getreg(_IA64_REG_AR_ITC); - * XEN_MAPPEDREGS->itc_offset = val - mitc; - * XEN_MAPPEDREGS->itc_last = val; - */ -/* 2 bundles */ -DEFINE_VOID_FUNC1(set_itc, - "mov r2 = " __stringify(XSI_BASE) " + " - __stringify(XSI_ITC_LAST_OFS) "\n" - "mov r3 = ar.itc\n" - ";;\n" - "sub r3 = r8, r3\n" - "st8 [r2] = r8, " - __stringify(XSI_ITC_LAST_OFS) " - " - __stringify(XSI_ITC_OFFSET_OFS) "\n" - ";;\n" - "st8 [r2] = r3\n"); - -/* - * static unsigned long xen_get_itc(void) - * unsigned long res; - * unsigned long itc_offset; - * unsigned long itc_last; - * unsigned long ret_itc_last; - * - * itc_offset = XEN_MAPPEDREGS->itc_offset; - * do { - * itc_last = XEN_MAPPEDREGS->itc_last; - * res = ia64_native_getreg(_IA64_REG_AR_ITC); - * res += itc_offset; - * if (itc_last >= res) - * res = itc_last + 1; - * ret_itc_last = cmpxchg(&XEN_MAPPEDREGS->itc_last, - * itc_last, res); - * } while (unlikely(ret_itc_last != itc_last)); - * return res; - */ -/* 5 bundles */ -DEFINE_FUNC0(get_itc, - "mov r2 = " __stringify(XSI_BASE) " + " - __stringify(XSI_ITC_OFFSET_OFS) "\n" - ";;\n" - "ld8 r9 = [r2], " __stringify(XSI_ITC_LAST_OFS) " - " - __stringify(XSI_ITC_OFFSET_OFS) "\n" - /* r9 = itc_offset */ - /* r2 = XSI_ITC_OFFSET */ - "888:\n" - "mov r8 = ar.itc\n" /* res = ar.itc */ - ";;\n" - "ld8 r3 = [r2]\n" /* r3 = itc_last */ - "add r8 = r8, r9\n" /* res = ar.itc + itc_offset */ - ";;\n" - "cmp.gtu p6, p0 = r3, r8\n" - ";;\n" - "(p6) add r8 = 1, r3\n" /* if (itc_last > res) itc_last + 1 */ - ";;\n" - "mov ar.ccv = r8\n" - ";;\n" - "cmpxchg8.acq r10 = [r2], r8, ar.ccv\n" - ";;\n" - "cmp.ne p6, p0 = r10, r3\n" - "(p6) hint @pause\n" - "(p6) br.cond.spnt 888b\n"); - -DEFINE_VOID_FUNC1_VOID(fc, - "break " __stringify(HYPERPRIVOP_FC) "\n"); - -/* - * psr_i_addr_addr = XEN_PSR_I_ADDR_ADDR - * masked_addr = *psr_i_addr_addr - * pending_intr_addr = masked_addr - 1 - * if (val & IA64_PSR_I) { - * masked = *masked_addr - * *masked_addr = 0:xen_set_virtual_psr_i(1) - * compiler barrier - * if (masked) { - * uint8_t pending = *pending_intr_addr; - * if (pending) - * XEN_HYPER_SSM_I - * } - * } else { - * *masked_addr = 1:xen_set_virtual_psr_i(0) - * } - */ -/* 6 bundles */ -DEFINE_VOID_FUNC1(intrin_local_irq_restore, - /* r8 = input value: 0 or IA64_PSR_I - * p6 = (flags & IA64_PSR_I) - * = if clause - * p7 = !(flags & IA64_PSR_I) - * = else clause - */ - "cmp.ne p6, p7 = r8, r0\n" - "mov r9 = " __stringify(XEN_PSR_I_ADDR_ADDR) "\n" - ";;\n" - /* r9 = XEN_PSR_I_ADDR */ - "ld8 r9 = [r9]\n" - ";;\n" - - /* r10 = masked previous value */ - "(p6) ld1.acq r10 = [r9]\n" - ";;\n" - - /* p8 = !masked interrupt masked previously? */ - "(p6) cmp.ne.unc p8, p0 = r10, r0\n" - - /* p7 = else clause */ - "(p7) mov r11 = 1\n" - ";;\n" - /* masked = 1 */ - "(p7) st1.rel [r9] = r11\n" - - /* p6 = if clause */ - /* masked = 0 - * r9 = masked_addr - 1 - * = pending_intr_addr - */ - "(p8) st1.rel [r9] = r0, -1\n" - ";;\n" - /* r8 = pending_intr */ - "(p8) ld1.acq r11 = [r9]\n" - ";;\n" - /* p9 = interrupt pending? 
*/ - "(p8) cmp.ne.unc p9, p10 = r11, r0\n" - ";;\n" - "(p10) mf\n" - /* issue hypercall to trigger interrupt */ - "(p9) break " __stringify(HYPERPRIVOP_SSM_I) "\n"); - -DEFINE_VOID_FUNC2(ptcga, - "break " __stringify(HYPERPRIVOP_PTC_GA) "\n"); -DEFINE_VOID_FUNC2(set_rr, - "break " __stringify(HYPERPRIVOP_SET_RR) "\n"); - -/* - * tmp = XEN_MAPPEDREGS->interrupt_mask_addr = XEN_PSR_I_ADDR_ADDR; - * tmp = *tmp - * tmp = *tmp; - * psr_i = tmp? 0: IA64_PSR_I; - */ -/* 4 bundles */ -DEFINE_FUNC0(get_psr_i, - "mov r9 = " __stringify(XEN_PSR_I_ADDR_ADDR) "\n" - ";;\n" - "ld8 r9 = [r9]\n" /* r9 = XEN_PSR_I_ADDR */ - "mov r8 = 0\n" /* psr_i = 0 */ - ";;\n" - "ld1.acq r9 = [r9]\n" /* r9 = XEN_PSR_I */ - ";;\n" - "cmp.eq.unc p6, p0 = r9, r0\n" /* p6 = (XEN_PSR_I != 0) */ - ";;\n" - "(p6) mov r8 = " __stringify(1 << IA64_PSR_I_BIT) "\n"); - -DEFINE_FUNC1(thash, unsigned long, - "break " __stringify(HYPERPRIVOP_THASH) "\n"); -DEFINE_FUNC1(get_cpuid, int, - "break " __stringify(HYPERPRIVOP_GET_CPUID) "\n"); -DEFINE_FUNC1(get_pmd, int, - "break " __stringify(HYPERPRIVOP_GET_PMD) "\n"); -DEFINE_FUNC1(get_rr, unsigned long, - "break " __stringify(HYPERPRIVOP_GET_RR) "\n"); - -/* - * void xen_privop_ssm_i(void) - * - * int masked = !xen_get_virtual_psr_i(); - * // masked = *(*XEN_MAPPEDREGS->interrupt_mask_addr) - * xen_set_virtual_psr_i(1) - * // *(*XEN_MAPPEDREGS->interrupt_mask_addr) = 0 - * // compiler barrier - * if (masked) { - * uint8_t* pend_int_addr = - * (uint8_t*)(*XEN_MAPPEDREGS->interrupt_mask_addr) - 1; - * uint8_t pending = *pend_int_addr; - * if (pending) - * XEN_HYPER_SSM_I - * } - */ -/* 4 bundles */ -DEFINE_VOID_FUNC0(ssm_i, - "mov r8 = " __stringify(XEN_PSR_I_ADDR_ADDR) "\n" - ";;\n" - "ld8 r8 = [r8]\n" /* r8 = XEN_PSR_I_ADDR */ - ";;\n" - "ld1.acq r9 = [r8]\n" /* r9 = XEN_PSR_I */ - ";;\n" - "st1.rel [r8] = r0, -1\n" /* psr_i = 0. enable interrupt - * r8 = XEN_PSR_I_ADDR - 1 - * = pend_int_addr - */ - "cmp.eq.unc p0, p6 = r9, r0\n"/* p6 = !XEN_PSR_I - * previously interrupt - * masked? 
- */ - ";;\n" - "(p6) ld1.acq r8 = [r8]\n" /* r8 = xen_pend_int */ - ";;\n" - "(p6) cmp.eq.unc p6, p7 = r8, r0\n" /*interrupt pending?*/ - ";;\n" - /* issue hypercall to get interrupt */ - "(p7) break " __stringify(HYPERPRIVOP_SSM_I) "\n" - ";;\n"); - -/* - * psr_i_addr_addr = XEN_MAPPEDREGS->interrupt_mask_addr - * = XEN_PSR_I_ADDR_ADDR; - * psr_i_addr = *psr_i_addr_addr; - * *psr_i_addr = 1; - */ -/* 2 bundles */ -DEFINE_VOID_FUNC0(rsm_i, - "mov r8 = " __stringify(XEN_PSR_I_ADDR_ADDR) "\n" - /* r8 = XEN_PSR_I_ADDR */ - "mov r9 = 1\n" - ";;\n" - "ld8 r8 = [r8]\n" /* r8 = XEN_PSR_I */ - ";;\n" - "st1.rel [r8] = r9\n"); /* XEN_PSR_I = 1 */ - -extern void -xen_set_rr0_to_rr4(unsigned long val0, unsigned long val1, - unsigned long val2, unsigned long val3, - unsigned long val4); -__DEFINE_FUNC(set_rr0_to_rr4, - "break " __stringify(HYPERPRIVOP_SET_RR0_TO_RR4) "\n"); - - -extern unsigned long xen_getreg(int regnum); -#define __DEFINE_GET_REG(id, privop) \ - "mov r2 = " __stringify(_IA64_REG_ ## id) "\n" \ - ";;\n" \ - "cmp.eq p6, p0 = r2, r8\n" \ - ";;\n" \ - "(p6) break " __stringify(HYPERPRIVOP_GET_ ## privop) "\n" \ - "(p6) br.cond.sptk.many b6\n" \ - ";;\n" - -__DEFINE_FUNC(getreg, - __DEFINE_GET_REG(PSR, PSR) -#ifdef CONFIG_IA32_SUPPORT - __DEFINE_GET_REG(AR_EFLAG, EFLAG) -#endif - - /* get_itc */ - "mov r2 = " __stringify(_IA64_REG_AR_ITC) "\n" - ";;\n" - "cmp.eq p6, p0 = r2, r8\n" - ";;\n" - "(p6) br.cond.spnt xen_get_itc\n" - ";;\n" - - /* get itm */ - "mov r2 = " __stringify(_IA64_REG_CR_ITM) "\n" - ";;\n" - "cmp.eq p6, p0 = r2, r8\n" - ";;\n" - "(p6) br.cond.spnt xen_get_itm_with_offset\n" - ";;\n" - - __DEFINE_GET_REG(CR_IVR, IVR) - __DEFINE_GET_REG(CR_TPR, TPR) - - /* fall back */ - "movl r2 = ia64_native_getreg_func\n" - ";;\n" - "mov b7 = r2\n" - ";;\n" - "br.cond.sptk.many b7\n"); - -extern void xen_setreg(int regnum, unsigned long val); -#define __DEFINE_SET_REG(id, privop) \ - "mov r2 = " __stringify(_IA64_REG_ ## id) "\n" \ - ";;\n" \ - "cmp.eq p6, p0 = r2, r9\n" \ - ";;\n" \ - "(p6) break " __stringify(HYPERPRIVOP_ ## privop) "\n" \ - "(p6) br.cond.sptk.many b6\n" \ - ";;\n" - -__DEFINE_FUNC(setreg, - /* kr0 .. 
kr 7*/ - /* - * if (_IA64_REG_AR_KR0 <= regnum && - * regnum <= _IA64_REG_AR_KR7) { - * register __index asm ("r8") = regnum - _IA64_REG_AR_KR0 - * register __val asm ("r9") = val - * "break HYPERPRIVOP_SET_KR" - * } - */ - "mov r17 = r9\n" - "mov r2 = " __stringify(_IA64_REG_AR_KR0) "\n" - ";;\n" - "cmp.ge p6, p0 = r9, r2\n" - "sub r17 = r17, r2\n" - ";;\n" - "(p6) cmp.ge.unc p7, p0 = " - __stringify(_IA64_REG_AR_KR7) " - " __stringify(_IA64_REG_AR_KR0) - ", r17\n" - ";;\n" - "(p7) mov r9 = r8\n" - ";;\n" - "(p7) mov r8 = r17\n" - "(p7) break " __stringify(HYPERPRIVOP_SET_KR) "\n" - - /* set itm */ - "mov r2 = " __stringify(_IA64_REG_CR_ITM) "\n" - ";;\n" - "cmp.eq p6, p0 = r2, r8\n" - ";;\n" - "(p6) br.cond.spnt xen_set_itm_with_offset\n" - - /* set itc */ - "mov r2 = " __stringify(_IA64_REG_AR_ITC) "\n" - ";;\n" - "cmp.eq p6, p0 = r2, r8\n" - ";;\n" - "(p6) br.cond.spnt xen_set_itc\n" - -#ifdef CONFIG_IA32_SUPPORT - __DEFINE_SET_REG(AR_EFLAG, SET_EFLAG) -#endif - __DEFINE_SET_REG(CR_TPR, SET_TPR) - __DEFINE_SET_REG(CR_EOI, EOI) - - /* fall back */ - "movl r2 = ia64_native_setreg_func\n" - ";;\n" - "mov b7 = r2\n" - ";;\n" - "br.cond.sptk.many b7\n"); -#endif - -static const struct pv_cpu_ops xen_cpu_ops __initconst = { +static const struct pv_cpu_ops xen_cpu_ops __initdata = { .fc = xen_fc, .thash = xen_thash, .get_cpuid = xen_get_cpuid, @@ -880,7 +337,7 @@ xen_iosapic_write(char __iomem *iosapic, unsigned int reg, u32 val) HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op); } -static struct pv_iosapic_ops xen_iosapic_ops __initdata = { +static const struct pv_iosapic_ops xen_iosapic_ops __initconst = { .pcat_compat_init = xen_pcat_compat_init, .__get_irq_chip = xen_iosapic_get_irq_chip, @@ -898,8 +355,6 @@ xen_setup_pv_ops(void) xen_info_init(); pv_info = xen_info; pv_init_ops = xen_init_ops; - pv_fsys_data = xen_fsys_data; - pv_patchdata = xen_patchdata; pv_cpu_ops = xen_cpu_ops; pv_iosapic_ops = xen_iosapic_ops; pv_irq_ops = xen_irq_ops; @@ -907,252 +362,3 @@ xen_setup_pv_ops(void) paravirt_cpu_asm_init(&xen_cpu_asm_switch); } - -#ifdef ASM_SUPPORTED -/*************************************************************************** - * binary pacthing - * pv_init_ops.patch_bundle - */ - -#define DEFINE_FUNC_GETREG(name, privop) \ - DEFINE_FUNC0(get_ ## name, \ - "break "__stringify(HYPERPRIVOP_GET_ ## privop) "\n") - -DEFINE_FUNC_GETREG(psr, PSR); -DEFINE_FUNC_GETREG(eflag, EFLAG); -DEFINE_FUNC_GETREG(ivr, IVR); -DEFINE_FUNC_GETREG(tpr, TPR); - -#define DEFINE_FUNC_SET_KR(n) \ - DEFINE_VOID_FUNC0(set_kr ## n, \ - ";;\n" \ - "mov r9 = r8\n" \ - "mov r8 = " #n "\n" \ - "break " __stringify(HYPERPRIVOP_SET_KR) "\n") - -DEFINE_FUNC_SET_KR(0); -DEFINE_FUNC_SET_KR(1); -DEFINE_FUNC_SET_KR(2); -DEFINE_FUNC_SET_KR(3); -DEFINE_FUNC_SET_KR(4); -DEFINE_FUNC_SET_KR(5); -DEFINE_FUNC_SET_KR(6); -DEFINE_FUNC_SET_KR(7); - -#define __DEFINE_FUNC_SETREG(name, privop) \ - DEFINE_VOID_FUNC0(name, \ - "break "__stringify(HYPERPRIVOP_ ## privop) "\n") - -#define DEFINE_FUNC_SETREG(name, privop) \ - __DEFINE_FUNC_SETREG(set_ ## name, SET_ ## privop) - -DEFINE_FUNC_SETREG(eflag, EFLAG); -DEFINE_FUNC_SETREG(tpr, TPR); -__DEFINE_FUNC_SETREG(eoi, EOI); - -extern const char xen_check_events[]; -extern const char __xen_intrin_local_irq_restore_direct_start[]; -extern const char __xen_intrin_local_irq_restore_direct_end[]; -extern const unsigned long __xen_intrin_local_irq_restore_direct_reloc; - -asm ( - ".align 32\n" - ".proc xen_check_events\n" - "xen_check_events:\n" - /* masked = 0 - * r9 = masked_addr - 1 - * 
= pending_intr_addr - */ - "st1.rel [r9] = r0, -1\n" - ";;\n" - /* r8 = pending_intr */ - "ld1.acq r11 = [r9]\n" - ";;\n" - /* p9 = interrupt pending? */ - "cmp.ne p9, p10 = r11, r0\n" - ";;\n" - "(p10) mf\n" - /* issue hypercall to trigger interrupt */ - "(p9) break " __stringify(HYPERPRIVOP_SSM_I) "\n" - "br.cond.sptk.many b6\n" - ".endp xen_check_events\n" - "\n" - ".align 32\n" - ".proc __xen_intrin_local_irq_restore_direct\n" - "__xen_intrin_local_irq_restore_direct:\n" - "__xen_intrin_local_irq_restore_direct_start:\n" - "1:\n" - "{\n" - "cmp.ne p6, p7 = r8, r0\n" - "mov r17 = ip\n" /* get ip to calc return address */ - "mov r9 = "__stringify(XEN_PSR_I_ADDR_ADDR) "\n" - ";;\n" - "}\n" - "{\n" - /* r9 = XEN_PSR_I_ADDR */ - "ld8 r9 = [r9]\n" - ";;\n" - /* r10 = masked previous value */ - "(p6) ld1.acq r10 = [r9]\n" - "adds r17 = 1f - 1b, r17\n" /* calculate return address */ - ";;\n" - "}\n" - "{\n" - /* p8 = !masked interrupt masked previously? */ - "(p6) cmp.ne.unc p8, p0 = r10, r0\n" - "\n" - /* p7 = else clause */ - "(p7) mov r11 = 1\n" - ";;\n" - "(p8) mov b6 = r17\n" /* set return address */ - "}\n" - "{\n" - /* masked = 1 */ - "(p7) st1.rel [r9] = r11\n" - "\n" - "[99:]\n" - "(p8) brl.cond.dptk.few xen_check_events\n" - "}\n" - /* pv calling stub is 5 bundles. fill nop to adjust return address */ - "{\n" - "nop 0\n" - "nop 0\n" - "nop 0\n" - "}\n" - "1:\n" - "__xen_intrin_local_irq_restore_direct_end:\n" - ".endp __xen_intrin_local_irq_restore_direct\n" - "\n" - ".align 8\n" - "__xen_intrin_local_irq_restore_direct_reloc:\n" - "data8 99b\n" -); - -static struct paravirt_patch_bundle_elem xen_patch_bundle_elems[] -__initdata_or_module = -{ -#define XEN_PATCH_BUNDLE_ELEM(name, type) \ - { \ - (void*)xen_ ## name ## _direct_start, \ - (void*)xen_ ## name ## _direct_end, \ - PARAVIRT_PATCH_TYPE_ ## type, \ - } - - XEN_PATCH_BUNDLE_ELEM(fc, FC), - XEN_PATCH_BUNDLE_ELEM(thash, THASH), - XEN_PATCH_BUNDLE_ELEM(get_cpuid, GET_CPUID), - XEN_PATCH_BUNDLE_ELEM(get_pmd, GET_PMD), - XEN_PATCH_BUNDLE_ELEM(ptcga, PTCGA), - XEN_PATCH_BUNDLE_ELEM(get_rr, GET_RR), - XEN_PATCH_BUNDLE_ELEM(set_rr, SET_RR), - XEN_PATCH_BUNDLE_ELEM(set_rr0_to_rr4, SET_RR0_TO_RR4), - XEN_PATCH_BUNDLE_ELEM(ssm_i, SSM_I), - XEN_PATCH_BUNDLE_ELEM(rsm_i, RSM_I), - XEN_PATCH_BUNDLE_ELEM(get_psr_i, GET_PSR_I), - { - (void*)__xen_intrin_local_irq_restore_direct_start, - (void*)__xen_intrin_local_irq_restore_direct_end, - PARAVIRT_PATCH_TYPE_INTRIN_LOCAL_IRQ_RESTORE, - }, - -#define XEN_PATCH_BUNDLE_ELEM_GETREG(name, reg) \ - { \ - xen_get_ ## name ## _direct_start, \ - xen_get_ ## name ## _direct_end, \ - PARAVIRT_PATCH_TYPE_GETREG + _IA64_REG_ ## reg, \ - } - - XEN_PATCH_BUNDLE_ELEM_GETREG(psr, PSR), - XEN_PATCH_BUNDLE_ELEM_GETREG(eflag, AR_EFLAG), - - XEN_PATCH_BUNDLE_ELEM_GETREG(ivr, CR_IVR), - XEN_PATCH_BUNDLE_ELEM_GETREG(tpr, CR_TPR), - - XEN_PATCH_BUNDLE_ELEM_GETREG(itc, AR_ITC), - XEN_PATCH_BUNDLE_ELEM_GETREG(itm_with_offset, CR_ITM), - - -#define __XEN_PATCH_BUNDLE_ELEM_SETREG(name, reg) \ - { \ - xen_ ## name ## _direct_start, \ - xen_ ## name ## _direct_end, \ - PARAVIRT_PATCH_TYPE_SETREG + _IA64_REG_ ## reg, \ - } - -#define XEN_PATCH_BUNDLE_ELEM_SETREG(name, reg) \ - __XEN_PATCH_BUNDLE_ELEM_SETREG(set_ ## name, reg) - - XEN_PATCH_BUNDLE_ELEM_SETREG(kr0, AR_KR0), - XEN_PATCH_BUNDLE_ELEM_SETREG(kr1, AR_KR1), - XEN_PATCH_BUNDLE_ELEM_SETREG(kr2, AR_KR2), - XEN_PATCH_BUNDLE_ELEM_SETREG(kr3, AR_KR3), - XEN_PATCH_BUNDLE_ELEM_SETREG(kr4, AR_KR4), - XEN_PATCH_BUNDLE_ELEM_SETREG(kr5, AR_KR5), - 
XEN_PATCH_BUNDLE_ELEM_SETREG(kr6, AR_KR6), - XEN_PATCH_BUNDLE_ELEM_SETREG(kr7, AR_KR7), - - XEN_PATCH_BUNDLE_ELEM_SETREG(eflag, AR_EFLAG), - XEN_PATCH_BUNDLE_ELEM_SETREG(tpr, CR_TPR), - __XEN_PATCH_BUNDLE_ELEM_SETREG(eoi, CR_EOI), - - XEN_PATCH_BUNDLE_ELEM_SETREG(itc, AR_ITC), - XEN_PATCH_BUNDLE_ELEM_SETREG(itm_with_offset, CR_ITM), -}; - -static unsigned long __init_or_module -xen_patch_bundle(void *sbundle, void *ebundle, unsigned long type) -{ - const unsigned long nelems = sizeof(xen_patch_bundle_elems) / - sizeof(xen_patch_bundle_elems[0]); - unsigned long used; - const struct paravirt_patch_bundle_elem *found; - - used = __paravirt_patch_apply_bundle(sbundle, ebundle, type, - xen_patch_bundle_elems, nelems, - &found); - - if (found == NULL) - /* fallback */ - return ia64_native_patch_bundle(sbundle, ebundle, type); - if (used == 0) - return used; - - /* relocation */ - switch (type) { - case PARAVIRT_PATCH_TYPE_INTRIN_LOCAL_IRQ_RESTORE: { - unsigned long reloc = - __xen_intrin_local_irq_restore_direct_reloc; - unsigned long reloc_offset = reloc - (unsigned long) - __xen_intrin_local_irq_restore_direct_start; - unsigned long tag = (unsigned long)sbundle + reloc_offset; - paravirt_patch_reloc_brl(tag, xen_check_events); - break; - } - default: - /* nothing */ - break; - } - return used; -} -#endif /* ASM_SUPPOTED */ - -const struct paravirt_patch_branch_target xen_branch_target[] -__initconst = { -#define PARAVIRT_BR_TARGET(name, type) \ - { \ - &xen_ ## name, \ - PARAVIRT_PATCH_TYPE_BR_ ## type, \ - } - PARAVIRT_BR_TARGET(switch_to, SWITCH_TO), - PARAVIRT_BR_TARGET(leave_syscall, LEAVE_SYSCALL), - PARAVIRT_BR_TARGET(work_processed_syscall, WORK_PROCESSED_SYSCALL), - PARAVIRT_BR_TARGET(leave_kernel, LEAVE_KERNEL), -}; - -static void __init -xen_patch_branch(unsigned long tag, unsigned long type) -{ - const unsigned long nelem = - sizeof(xen_branch_target) / sizeof(xen_branch_target[0]); - __paravirt_patch_apply_branch(tag, type, xen_branch_target, nelem); -} diff --git a/trunk/arch/x86/boot/memory.c b/trunk/arch/x86/boot/memory.c index 5054c2ddd1a0..8c3c25f35578 100644 --- a/trunk/arch/x86/boot/memory.c +++ b/trunk/arch/x86/boot/memory.c @@ -2,7 +2,6 @@ * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007 rPath, Inc. - All Rights Reserved - * Copyright 2009 Intel Corporation; author H. Peter Anvin * * This file is part of the Linux kernel, and is made available under * the terms of the GNU General Public License version 2. @@ -17,38 +16,24 @@ #define SMAP 0x534d4150 /* ASCII "SMAP" */ -struct e820_ext_entry { - struct e820entry std; - u32 ext_flags; -} __attribute__((packed)); - static int detect_memory_e820(void) { int count = 0; u32 next = 0; - u32 size, id, edi; + u32 size, id; u8 err; struct e820entry *desc = boot_params.e820_map; - static struct e820_ext_entry buf; /* static so it is zeroed */ - - /* - * Set this here so that if the BIOS doesn't change this field - * but still doesn't change %ecx, we're still okay... - */ - buf.ext_flags = 1; do { - size = sizeof buf; + size = sizeof(struct e820entry); - /* Important: %edx and %esi are clobbered by some BIOSes, - so they must be either used for the error output - or explicitly marked clobbered. Given that, assume there - is something out there clobbering %ebp and %edi, too. */ - asm("pushl %%ebp; int $0x15; popl %%ebp; setc %0" + /* Important: %edx is clobbered by some BIOSes, + so it must be either used for the error output + or explicitly marked clobbered. 
*/ + asm("int $0x15; setc %0" : "=d" (err), "+b" (next), "=a" (id), "+c" (size), - "=D" (edi), "+m" (buf) - : "D" (&buf), "d" (SMAP), "a" (0xe820) - : "esi"); + "=m" (*desc) + : "D" (desc), "d" (SMAP), "a" (0xe820)); /* BIOSes which terminate the chain with CF = 1 as opposed to %ebx = 0 don't always report the SMAP signature on @@ -66,14 +51,8 @@ static int detect_memory_e820(void) break; } - /* ACPI 3.0 added the extended flags support. If bit 0 - in the extended flags is zero, we're supposed to simply - ignore the entry -- a backwards incompatible change! */ - if (size > 20 && !(buf.ext_flags & 1)) - continue; - - *desc++ = buf.std; count++; + desc++; } while (next && count < ARRAY_SIZE(boot_params.e820_map)); return boot_params.e820_entries = count; diff --git a/trunk/drivers/gpu/drm/drm_crtc_helper.c b/trunk/drivers/gpu/drm/drm_crtc_helper.c index a04639dc633d..1c3a8c557140 100644 --- a/trunk/drivers/gpu/drm/drm_crtc_helper.c +++ b/trunk/drivers/gpu/drm/drm_crtc_helper.c @@ -42,26 +42,6 @@ static struct drm_display_mode std_modes[] = { DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC) }, }; -static void drm_mode_validate_flag(struct drm_connector *connector, - int flags) -{ - struct drm_display_mode *mode, *t; - - if (flags == (DRM_MODE_FLAG_DBLSCAN | DRM_MODE_FLAG_INTERLACE)) - return; - - list_for_each_entry_safe(mode, t, &connector->modes, head) { - if ((mode->flags & DRM_MODE_FLAG_INTERLACE) && - !(flags & DRM_MODE_FLAG_INTERLACE)) - mode->status = MODE_NO_INTERLACE; - if ((mode->flags & DRM_MODE_FLAG_DBLSCAN) && - !(flags & DRM_MODE_FLAG_DBLSCAN)) - mode->status = MODE_NO_DBLESCAN; - } - - return; -} - /** * drm_helper_probe_connector_modes - get complete set of display modes * @dev: DRM device @@ -92,7 +72,6 @@ int drm_helper_probe_single_connector_modes(struct drm_connector *connector, struct drm_connector_helper_funcs *connector_funcs = connector->helper_private; int count = 0; - int mode_flags = 0; DRM_DEBUG("%s\n", drm_get_connector_name(connector)); /* set all modes to the unverified state */ @@ -117,13 +96,6 @@ int drm_helper_probe_single_connector_modes(struct drm_connector *connector, if (maxX && maxY) drm_mode_validate_size(dev, &connector->modes, maxX, maxY, 0); - - if (connector->interlace_allowed) - mode_flags |= DRM_MODE_FLAG_INTERLACE; - if (connector->doublescan_allowed) - mode_flags |= DRM_MODE_FLAG_DBLSCAN; - drm_mode_validate_flag(connector, mode_flags); - list_for_each_entry_safe(mode, t, &connector->modes, head) { if (mode->status == MODE_OK) mode->status = connector_funcs->mode_valid(connector, @@ -913,6 +885,7 @@ bool drm_helper_plugged_event(struct drm_device *dev) /** * drm_initial_config - setup a sane initial connector configuration * @dev: DRM device + * @can_grow: this configuration is growable * * LOCKING: * Called at init time, must take mode config lock. @@ -924,7 +897,7 @@ bool drm_helper_plugged_event(struct drm_device *dev) * RETURNS: * Zero if everything went ok, nonzero otherwise. 
*/ -bool drm_helper_initial_config(struct drm_device *dev) +bool drm_helper_initial_config(struct drm_device *dev, bool can_grow) { struct drm_connector *connector; int count = 0; diff --git a/trunk/drivers/gpu/drm/drm_edid.c b/trunk/drivers/gpu/drm/drm_edid.c index ca9c61656714..c67400067b85 100644 --- a/trunk/drivers/gpu/drm/drm_edid.c +++ b/trunk/drivers/gpu/drm/drm_edid.c @@ -125,8 +125,10 @@ static bool edid_is_valid(struct edid *edid) DRM_ERROR("EDID has major version %d, instead of 1\n", edid->version); goto bad; } - if (edid->revision > 4) - DRM_DEBUG("EDID minor > 4, assuming backward compatibility\n"); + if (edid->revision > 3) { + DRM_ERROR("EDID has minor version %d, which is not between 0-3\n", edid->revision); + goto bad; + } for (i = 0; i < EDID_LENGTH; i++) csum += raw_edid[i]; @@ -160,7 +162,7 @@ static bool edid_vendor(struct edid *edid, char *vendor) edid_vendor[0] = ((edid->mfg_id[0] & 0x7c) >> 2) + '@'; edid_vendor[1] = (((edid->mfg_id[0] & 0x3) << 3) | ((edid->mfg_id[1] & 0xe0) >> 5)) + '@'; - edid_vendor[2] = (edid->mfg_id[1] & 0x1f) + '@'; + edid_vendor[2] = (edid->mfg_id[2] & 0x1f) + '@'; return !strncmp(edid_vendor, vendor, 3); } diff --git a/trunk/drivers/gpu/drm/i915/i915_dma.c b/trunk/drivers/gpu/drm/i915/i915_dma.c index 9648d79fe36d..85549f615b1f 100644 --- a/trunk/drivers/gpu/drm/i915/i915_dma.c +++ b/trunk/drivers/gpu/drm/i915/i915_dma.c @@ -1049,7 +1049,7 @@ static int i915_load_modeset_init(struct drm_device *dev) intel_modeset_init(dev); - drm_helper_initial_config(dev); + drm_helper_initial_config(dev, false); return 0; diff --git a/trunk/drivers/gpu/drm/i915/i915_gem.c b/trunk/drivers/gpu/drm/i915/i915_gem.c index e0389ad1477d..0abccb761121 100644 --- a/trunk/drivers/gpu/drm/i915/i915_gem.c +++ b/trunk/drivers/gpu/drm/i915/i915_gem.c @@ -1990,20 +1990,23 @@ static void i830_write_fence_reg(struct drm_i915_fence_reg *reg) int regnum = obj_priv->fence_reg; uint32_t val; uint32_t pitch_val; + uint32_t fence_size_bits; - if ((obj_priv->gtt_offset & ~I915_FENCE_START_MASK) || + if ((obj_priv->gtt_offset & ~I830_FENCE_START_MASK) || (obj_priv->gtt_offset & (obj->size - 1))) { - WARN(1, "%s: object 0x%08x not 1M or size aligned\n", + WARN(1, "%s: object 0x%08x not 512K or size aligned\n", __func__, obj_priv->gtt_offset); return; } pitch_val = (obj_priv->stride / 128) - 1; - + WARN_ON(pitch_val & ~0x0000000f); val = obj_priv->gtt_offset; if (obj_priv->tiling_mode == I915_TILING_Y) val |= 1 << I830_FENCE_TILING_Y_SHIFT; - val |= I830_FENCE_SIZE_BITS(obj->size); + fence_size_bits = I830_FENCE_SIZE_BITS(obj->size); + WARN_ON(fence_size_bits & ~0x00000f00); + val |= fence_size_bits; val |= pitch_val << I830_FENCE_PITCH_SHIFT; val |= I830_FENCE_REG_VALID; @@ -2194,7 +2197,7 @@ i915_gem_object_bind_to_gtt(struct drm_gem_object *obj, unsigned alignment) return -EBUSY; if (alignment == 0) alignment = i915_gem_get_gtt_alignment(obj); - if (alignment & (PAGE_SIZE - 1)) { + if (alignment & (i915_gem_get_gtt_alignment(obj) - 1)) { DRM_ERROR("Invalid object alignment requested %u\n", alignment); return -EINVAL; } diff --git a/trunk/drivers/gpu/drm/i915/i915_gem_tiling.c b/trunk/drivers/gpu/drm/i915/i915_gem_tiling.c index 4cce1aef438e..6be3f927c86a 100644 --- a/trunk/drivers/gpu/drm/i915/i915_gem_tiling.c +++ b/trunk/drivers/gpu/drm/i915/i915_gem_tiling.c @@ -216,6 +216,22 @@ i915_tiling_ok(struct drm_device *dev, int stride, int size, int tiling_mode) else tile_width = 512; + /* check maximum stride & object size */ + if (IS_I965G(dev)) { + /* i965 stores the end address 
of the gtt mapping in the fence + * reg, so dont bother to check the size */ + if (stride / 128 > I965_FENCE_MAX_PITCH_VAL) + return false; + } else if (IS_I9XX(dev)) { + if (stride / tile_width > I830_FENCE_MAX_PITCH_VAL || + size > (I830_FENCE_MAX_SIZE_VAL << 20)) + return false; + } else { + if (stride / 128 > I830_FENCE_MAX_PITCH_VAL || + size > (I830_FENCE_MAX_SIZE_VAL << 19)) + return false; + } + /* 965+ just needs multiples of tile width */ if (IS_I965G(dev)) { if (stride & (tile_width - 1)) diff --git a/trunk/drivers/gpu/drm/i915/i915_reg.h b/trunk/drivers/gpu/drm/i915/i915_reg.h index 377cc588f5e9..83357b09e546 100644 --- a/trunk/drivers/gpu/drm/i915/i915_reg.h +++ b/trunk/drivers/gpu/drm/i915/i915_reg.h @@ -190,6 +190,8 @@ #define I830_FENCE_SIZE_BITS(size) ((ffs((size) >> 19) - 1) << 8) #define I830_FENCE_PITCH_SHIFT 4 #define I830_FENCE_REG_VALID (1<<0) +#define I830_FENCE_MAX_PITCH_VAL 0x10 +#define I830_FENCE_MAX_SIZE_VAL (1<<8) #define I915_FENCE_START_MASK 0x0ff00000 #define I915_FENCE_SIZE_BITS(size) ((ffs((size) >> 20) - 1) << 8) @@ -198,6 +200,7 @@ #define I965_FENCE_PITCH_SHIFT 2 #define I965_FENCE_TILING_Y_SHIFT 1 #define I965_FENCE_REG_VALID (1<<0) +#define I965_FENCE_MAX_PITCH_VAL 0x0400 /* * Instruction and interrupt control regs diff --git a/trunk/drivers/s390/net/qeth_core_offl.c b/trunk/drivers/s390/net/qeth_core_offl.c new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/trunk/drivers/s390/net/qeth_core_offl.h b/trunk/drivers/s390/net/qeth_core_offl.h new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/trunk/drivers/serial/serial_core.c b/trunk/drivers/serial/serial_core.c index b0bb29d804ae..bf3c0e32a334 100644 --- a/trunk/drivers/serial/serial_core.c +++ b/trunk/drivers/serial/serial_core.c @@ -1765,7 +1765,7 @@ static void uart_line_info(struct seq_file *m, struct uart_driver *drv, int i) static int uart_proc_show(struct seq_file *m, void *v) { - struct tty_driver *ttydrv = m->private; + struct tty_driver *ttydrv = v; struct uart_driver *drv = ttydrv->driver_state; int i; diff --git a/trunk/fs/btrfs/Makefile b/trunk/fs/btrfs/Makefile index 9adf5e4f7e96..d2cf5a54a4b8 100644 --- a/trunk/fs/btrfs/Makefile +++ b/trunk/fs/btrfs/Makefile @@ -8,7 +8,7 @@ btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \ - compression.o delayed-ref.o + compression.o else # Normal Makefile diff --git a/trunk/fs/btrfs/btrfs_inode.h b/trunk/fs/btrfs/btrfs_inode.h index b30986f00b9d..72677ce2b74f 100644 --- a/trunk/fs/btrfs/btrfs_inode.h +++ b/trunk/fs/btrfs/btrfs_inode.h @@ -66,12 +66,6 @@ struct btrfs_inode { */ struct list_head delalloc_inodes; - /* - * list for tracking inodes that must be sent to disk before a - * rename or truncate commit - */ - struct list_head ordered_operations; - /* the space_info for where this inode's data allocations are done */ struct btrfs_space_info *space_info; @@ -92,6 +86,12 @@ struct btrfs_inode { */ u64 logged_trans; + /* + * trans that last made a change that should be fully fsync'd. This + * gets reset to zero each time the inode is logged + */ + u64 log_dirty_trans; + /* total number of bytes pending delalloc, used by stat to calc the * real block usage of the file */ @@ -121,25 +121,6 @@ struct btrfs_inode { /* the start of block group preferred for allocations. 
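The checks added to i915_tiling_ok() above cap stride and object size so they still fit the i8xx fence encoding: pitch is stored as stride/128 - 1, and the size field holds log2(size >> 19) in bits 8..11. A standalone model using the macros from i915_reg.h; the check shown simplifies the pre-9xx branch only:

	#include <stdio.h>
	#include <strings.h>	/* ffs() */

	#define I830_FENCE_SIZE_BITS(size)	((ffs((size) >> 19) - 1) << 8)
	#define I830_FENCE_MAX_PITCH_VAL	0x10
	#define I830_FENCE_MAX_SIZE_VAL		(1 << 8)

	static int i830_tiling_ok(unsigned int stride, unsigned int size)
	{
		if (stride / 128 > I830_FENCE_MAX_PITCH_VAL)
			return 0;
		if (size > (I830_FENCE_MAX_SIZE_VAL << 19))
			return 0;
		return 1;
	}

	int main(void)
	{
		unsigned int size = 1 << 20;	/* 1M object */

		printf("size bits: 0x%x\n", I830_FENCE_SIZE_BITS(size));	/* 0x100 */
		printf("ok: %d\n", i830_tiling_ok(2048, size));			/* 1 */
		return 0;
	}

Because the size field is a log2, only power-of-two object sizes from 512K up encode at all, which is what the WARN_ON(fence_size_bits & ~0x00000f00) in i830_write_fence_reg() is guarding.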
*/ u64 block_group; - /* the fsync log has some corner cases that mean we have to check - * directories to see if any unlinks have been done before - * the directory was logged. See tree-log.c for all the - * details - */ - u64 last_unlink_trans; - - /* - * ordered_data_close is set by truncate when a file that used - * to have good data has been truncated to zero. When it is set - * the btrfs file release call will add this inode to the - * ordered operations list so that we make sure to flush out any - * new data the application may have written before commit. - * - * yes, its silly to have a single bitflag, but we might grow more - * of these. - */ - unsigned ordered_data_close:1; - struct inode vfs_inode; }; diff --git a/trunk/fs/btrfs/ctree.c b/trunk/fs/btrfs/ctree.c index dbb724124633..37f31b5529aa 100644 --- a/trunk/fs/btrfs/ctree.c +++ b/trunk/fs/btrfs/ctree.c @@ -254,13 +254,18 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, * empty_size -- a hint that you plan on doing more cow. This is the size in * bytes the allocator should try to find free next to the block it returns. * This is just a hint and may be ignored by the allocator. + * + * prealloc_dest -- if you have already reserved a destination for the cow, + * this uses that block instead of allocating a new one. + * btrfs_alloc_reserved_extent is used to finish the allocation. */ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, struct extent_buffer **cow_ret, - u64 search_start, u64 empty_size) + u64 search_start, u64 empty_size, + u64 prealloc_dest) { u64 parent_start; struct extent_buffer *cow; @@ -286,10 +291,26 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, level = btrfs_header_level(buf); nritems = btrfs_header_nritems(buf); - cow = btrfs_alloc_free_block(trans, root, buf->len, - parent_start, root->root_key.objectid, - trans->transid, level, - search_start, empty_size); + if (prealloc_dest) { + struct btrfs_key ins; + + ins.objectid = prealloc_dest; + ins.offset = buf->len; + ins.type = BTRFS_EXTENT_ITEM_KEY; + + ret = btrfs_alloc_reserved_extent(trans, root, parent_start, + root->root_key.objectid, + trans->transid, level, &ins); + BUG_ON(ret); + cow = btrfs_init_new_buffer(trans, root, prealloc_dest, + buf->len, level); + } else { + cow = btrfs_alloc_free_block(trans, root, buf->len, + parent_start, + root->root_key.objectid, + trans->transid, level, + search_start, empty_size); + } if (IS_ERR(cow)) return PTR_ERR(cow); @@ -392,7 +413,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, - struct extent_buffer **cow_ret) + struct extent_buffer **cow_ret, u64 prealloc_dest) { u64 search_start; int ret; @@ -415,6 +436,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, btrfs_header_owner(buf) == root->root_key.objectid && !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { *cow_ret = buf; + WARN_ON(prealloc_dest); return 0; } @@ -425,7 +447,8 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, btrfs_set_lock_blocking(buf); ret = __btrfs_cow_block(trans, root, buf, parent, - parent_slot, cow_ret, search_start, 0); + parent_slot, cow_ret, search_start, 0, + prealloc_dest); return ret; } @@ -594,7 +617,7 @@ int btrfs_realloc_node(struct 
btrfs_trans_handle *trans, err = __btrfs_cow_block(trans, root, cur, parent, i, &cur, search_start, min(16 * blocksize, - (end_slot - i) * blocksize)); + (end_slot - i) * blocksize), 0); if (err) { btrfs_tree_unlock(cur); free_extent_buffer(cur); @@ -914,7 +937,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, BUG_ON(!child); btrfs_tree_lock(child); btrfs_set_lock_blocking(child); - ret = btrfs_cow_block(trans, root, child, mid, 0, &child); + ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0); BUG_ON(ret); spin_lock(&root->node_lock); @@ -922,7 +945,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, spin_unlock(&root->node_lock); ret = btrfs_update_extent_ref(trans, root, child->start, - child->len, mid->start, child->start, root->root_key.objectid, trans->transid, level - 1); @@ -949,10 +971,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, BTRFS_NODEPTRS_PER_BLOCK(root) / 4) return 0; - if (trans->transaction->delayed_refs.flushing && - btrfs_header_nritems(mid) > 2) - return 0; - if (btrfs_header_nritems(mid) < 2) err_on_enospc = 1; @@ -961,7 +979,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_tree_lock(left); btrfs_set_lock_blocking(left); wret = btrfs_cow_block(trans, root, left, - parent, pslot - 1, &left); + parent, pslot - 1, &left, 0); if (wret) { ret = wret; goto enospc; @@ -972,7 +990,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_tree_lock(right); btrfs_set_lock_blocking(right); wret = btrfs_cow_block(trans, root, right, - parent, pslot + 1, &right); + parent, pslot + 1, &right, 0); if (wret) { ret = wret; goto enospc; @@ -1153,7 +1171,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, wret = 1; } else { ret = btrfs_cow_block(trans, root, left, parent, - pslot - 1, &left); + pslot - 1, &left, 0); if (ret) wret = 1; else { @@ -1204,7 +1222,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, } else { ret = btrfs_cow_block(trans, root, right, parent, pslot + 1, - &right); + &right, 0); if (ret) wret = 1; else { @@ -1474,6 +1492,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root u8 lowest_level = 0; u64 blocknr; u64 gen; + struct btrfs_key prealloc_block; lowest_level = p->lowest_level; WARN_ON(lowest_level && ins_len > 0); @@ -1482,6 +1501,8 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root if (ins_len < 0) lowest_unlock = 2; + prealloc_block.objectid = 0; + again: if (p->skip_locking) b = btrfs_root_node(root); @@ -1508,11 +1529,44 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) { goto cow_done; } + + /* ok, we have to cow, is our old prealloc the right + * size? + */ + if (prealloc_block.objectid && + prealloc_block.offset != b->len) { + btrfs_release_path(root, p); + btrfs_free_reserved_extent(root, + prealloc_block.objectid, + prealloc_block.offset); + prealloc_block.objectid = 0; + goto again; + } + + /* + * for higher level blocks, try not to allocate blocks + * with the block and the parent locks held. 
+ */ + if (level > 0 && !prealloc_block.objectid) { + u32 size = b->len; + u64 hint = b->start; + + btrfs_release_path(root, p); + ret = btrfs_reserve_extent(trans, root, + size, size, 0, + hint, (u64)-1, + &prealloc_block, 0); + BUG_ON(ret); + goto again; + } + btrfs_set_path_blocking(p); wret = btrfs_cow_block(trans, root, b, p->nodes[level + 1], - p->slots[level + 1], &b); + p->slots[level + 1], + &b, prealloc_block.objectid); + prealloc_block.objectid = 0; if (wret) { free_extent_buffer(b); ret = wret; @@ -1688,8 +1742,12 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root * we don't really know what they plan on doing with the path * from here on, so for now just mark it as blocking */ - if (!p->leave_spinning) - btrfs_set_path_blocking(p); + btrfs_set_path_blocking(p); + if (prealloc_block.objectid) { + btrfs_free_reserved_extent(root, + prealloc_block.objectid, + prealloc_block.offset); + } return ret; } @@ -1710,7 +1768,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans, int ret; eb = btrfs_lock_root_node(root); - ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb); + ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0); BUG_ON(ret); btrfs_set_lock_blocking(eb); @@ -1768,7 +1826,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans, } ret = btrfs_cow_block(trans, root, eb, parent, slot, - &eb); + &eb, 0); BUG_ON(ret); if (root->root_key.objectid == @@ -2081,7 +2139,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, spin_unlock(&root->node_lock); ret = btrfs_update_extent_ref(trans, root, lower->start, - lower->len, lower->start, c->start, + lower->start, c->start, root->root_key.objectid, trans->transid, level - 1); BUG_ON(ret); @@ -2163,7 +2221,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, ret = insert_new_root(trans, root, path, level + 1); if (ret) return ret; - } else if (!trans->transaction->delayed_refs.flushing) { + } else { ret = push_nodes_for_insert(trans, root, path, level); c = path->nodes[level]; if (!ret && btrfs_header_nritems(c) < @@ -2271,27 +2329,66 @@ noinline int btrfs_leaf_free_space(struct btrfs_root *root, return ret; } -static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - int data_size, int empty, - struct extent_buffer *right, - int free_space, u32 left_nritems) +/* + * push some data in the path leaf to the right, trying to free up at + * least data_size bytes. returns zero if the push worked, nonzero otherwise + * + * returns 1 if the push failed because the other node didn't have enough + * room, 0 if everything worked out and < 0 if there were major errors. 
+ */ +static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, int data_size, + int empty) { struct extent_buffer *left = path->nodes[0]; - struct extent_buffer *upper = path->nodes[1]; + struct extent_buffer *right; + struct extent_buffer *upper; struct btrfs_disk_key disk_key; int slot; u32 i; + int free_space; int push_space = 0; int push_items = 0; struct btrfs_item *item; + u32 left_nritems; u32 nr; u32 right_nritems; u32 data_end; u32 this_item_size; int ret; + slot = path->slots[1]; + if (!path->nodes[1]) + return 1; + + upper = path->nodes[1]; + if (slot >= btrfs_header_nritems(upper) - 1) + return 1; + + btrfs_assert_tree_locked(path->nodes[1]); + + right = read_node_slot(root, upper, slot + 1); + btrfs_tree_lock(right); + btrfs_set_lock_blocking(right); + + free_space = btrfs_leaf_free_space(root, right); + if (free_space < data_size) + goto out_unlock; + + /* cow and double check */ + ret = btrfs_cow_block(trans, root, right, upper, + slot + 1, &right, 0); + if (ret) + goto out_unlock; + + free_space = btrfs_leaf_free_space(root, right); + if (free_space < data_size) + goto out_unlock; + + left_nritems = btrfs_header_nritems(left); + if (left_nritems == 0) + goto out_unlock; + if (empty) nr = 0; else @@ -2300,7 +2397,6 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, if (path->slots[0] >= left_nritems) push_space += data_size; - slot = path->slots[1]; i = left_nritems - 1; while (i >= nr) { item = btrfs_item_nr(left, i); @@ -2431,83 +2527,25 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, return 1; } -/* - * push some data in the path leaf to the right, trying to free up at - * least data_size bytes. returns zero if the push worked, nonzero otherwise - * - * returns 1 if the push failed because the other node didn't have enough - * room, 0 if everything worked out and < 0 if there were major errors. - */ -static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_path *path, int data_size, - int empty) -{ - struct extent_buffer *left = path->nodes[0]; - struct extent_buffer *right; - struct extent_buffer *upper; - int slot; - int free_space; - u32 left_nritems; - int ret; - - if (!path->nodes[1]) - return 1; - - slot = path->slots[1]; - upper = path->nodes[1]; - if (slot >= btrfs_header_nritems(upper) - 1) - return 1; - - btrfs_assert_tree_locked(path->nodes[1]); - - right = read_node_slot(root, upper, slot + 1); - btrfs_tree_lock(right); - btrfs_set_lock_blocking(right); - - free_space = btrfs_leaf_free_space(root, right); - if (free_space < data_size) - goto out_unlock; - - /* cow and double check */ - ret = btrfs_cow_block(trans, root, right, upper, - slot + 1, &right); - if (ret) - goto out_unlock; - - free_space = btrfs_leaf_free_space(root, right); - if (free_space < data_size) - goto out_unlock; - - left_nritems = btrfs_header_nritems(left); - if (left_nritems == 0) - goto out_unlock; - - return __push_leaf_right(trans, root, path, data_size, empty, - right, free_space, left_nritems); -out_unlock: - btrfs_tree_unlock(right); - free_extent_buffer(right); - return 1; -} - /* * push some data in the path leaf to the left, trying to free up at * least data_size bytes. 
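Inside push_leaf_right() above, the while (i >= nr) loop decides how many items fit in the right neighbour by accumulating each item's data size plus its item header. A toy version of that accounting; ITEM_HEADER_SIZE approximates the 25-byte on-disk struct btrfs_item and should be treated as an assumption:

	#include <stdio.h>

	#define ITEM_HEADER_SIZE 25	/* approx. sizeof(struct btrfs_item) on disk */

	int main(void)
	{
		unsigned int sizes[] = { 40, 120, 300, 80, 60 };  /* data per item */
		unsigned int free_space = 400, push_space = 0;
		int n = 5, push_items = 0, i;

		/* walk from the edge nearest the neighbour, stop on overflow */
		for (i = n - 1; i >= 0; i--) {
			unsigned int need = sizes[i] + ITEM_HEADER_SIZE;

			if (push_space + need > free_space)
				break;
			push_space += need;
			push_items++;
		}

		printf("pushing %d items (%u bytes incl. headers)\n",
		       push_items, push_space);	/* 2 items, 190 bytes */
		return 0;
	}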
returns zero if the push worked, nonzero otherwise */ -static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, int data_size, - int empty, struct extent_buffer *left, - int free_space, int right_nritems) +static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, int data_size, + int empty) { struct btrfs_disk_key disk_key; struct extent_buffer *right = path->nodes[0]; + struct extent_buffer *left; int slot; int i; + int free_space; int push_space = 0; int push_items = 0; struct btrfs_item *item; u32 old_left_nritems; + u32 right_nritems; u32 nr; int ret = 0; int wret; @@ -2515,6 +2553,41 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, u32 old_left_item_size; slot = path->slots[1]; + if (slot == 0) + return 1; + if (!path->nodes[1]) + return 1; + + right_nritems = btrfs_header_nritems(right); + if (right_nritems == 0) + return 1; + + btrfs_assert_tree_locked(path->nodes[1]); + + left = read_node_slot(root, path->nodes[1], slot - 1); + btrfs_tree_lock(left); + btrfs_set_lock_blocking(left); + + free_space = btrfs_leaf_free_space(root, left); + if (free_space < data_size) { + ret = 1; + goto out; + } + + /* cow and double check */ + ret = btrfs_cow_block(trans, root, left, + path->nodes[1], slot - 1, &left, 0); + if (ret) { + /* we hit -ENOSPC, but it isn't fatal here */ + ret = 1; + goto out; + } + + free_space = btrfs_leaf_free_space(root, left); + if (free_space < data_size) { + ret = 1; + goto out; + } if (empty) nr = right_nritems; @@ -2681,154 +2754,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, return ret; } -/* - * push some data in the path leaf to the left, trying to free up at - * least data_size bytes. returns zero if the push worked, nonzero otherwise - */ -static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_path *path, int data_size, - int empty) -{ - struct extent_buffer *right = path->nodes[0]; - struct extent_buffer *left; - int slot; - int free_space; - u32 right_nritems; - int ret = 0; - - slot = path->slots[1]; - if (slot == 0) - return 1; - if (!path->nodes[1]) - return 1; - - right_nritems = btrfs_header_nritems(right); - if (right_nritems == 0) - return 1; - - btrfs_assert_tree_locked(path->nodes[1]); - - left = read_node_slot(root, path->nodes[1], slot - 1); - btrfs_tree_lock(left); - btrfs_set_lock_blocking(left); - - free_space = btrfs_leaf_free_space(root, left); - if (free_space < data_size) { - ret = 1; - goto out; - } - - /* cow and double check */ - ret = btrfs_cow_block(trans, root, left, - path->nodes[1], slot - 1, &left); - if (ret) { - /* we hit -ENOSPC, but it isn't fatal here */ - ret = 1; - goto out; - } - - free_space = btrfs_leaf_free_space(root, left); - if (free_space < data_size) { - ret = 1; - goto out; - } - - return __push_leaf_left(trans, root, path, data_size, - empty, left, free_space, right_nritems); -out: - btrfs_tree_unlock(left); - free_extent_buffer(left); - return ret; -} - -/* - * split the path's leaf in two, making sure there is at least data_size - * available for the resulting leaf level of the path. - * - * returns 0 if all went well and < 0 on failure. 
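/* Editorial aside, not patch content: the guards that make a left push
 * impossible, mirroring the checks now inlined in push_leaf_left above.
 * slot is our leaf's position in its parent, so slot == 0 means there is
 * no left sibling at all. Illustrative model only. */
static int can_push_left(int slot, int has_parent, unsigned int right_nritems)
{
	if (!has_parent)		/* we are the root leaf */
		return 0;
	if (slot == 0)			/* no left sibling exists */
		return 0;
	if (right_nritems == 0)		/* nothing to push */
		return 0;
	return 1;
}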
- */ -static noinline int copy_for_split(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct extent_buffer *l, - struct extent_buffer *right, - int slot, int mid, int nritems) -{ - int data_copy_size; - int rt_data_off; - int i; - int ret = 0; - int wret; - struct btrfs_disk_key disk_key; - - nritems = nritems - mid; - btrfs_set_header_nritems(right, nritems); - data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l); - - copy_extent_buffer(right, l, btrfs_item_nr_offset(0), - btrfs_item_nr_offset(mid), - nritems * sizeof(struct btrfs_item)); - - copy_extent_buffer(right, l, - btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) - - data_copy_size, btrfs_leaf_data(l) + - leaf_data_end(root, l), data_copy_size); - - rt_data_off = BTRFS_LEAF_DATA_SIZE(root) - - btrfs_item_end_nr(l, mid); - - for (i = 0; i < nritems; i++) { - struct btrfs_item *item = btrfs_item_nr(right, i); - u32 ioff; - - if (!right->map_token) { - map_extent_buffer(right, (unsigned long)item, - sizeof(struct btrfs_item), - &right->map_token, &right->kaddr, - &right->map_start, &right->map_len, - KM_USER1); - } - - ioff = btrfs_item_offset(right, item); - btrfs_set_item_offset(right, item, ioff + rt_data_off); - } - - if (right->map_token) { - unmap_extent_buffer(right, right->map_token, KM_USER1); - right->map_token = NULL; - } - - btrfs_set_header_nritems(l, mid); - ret = 0; - btrfs_item_key(right, &disk_key, 0); - wret = insert_ptr(trans, root, path, &disk_key, right->start, - path->slots[1] + 1, 1); - if (wret) - ret = wret; - - btrfs_mark_buffer_dirty(right); - btrfs_mark_buffer_dirty(l); - BUG_ON(path->slots[0] != slot); - - ret = btrfs_update_ref(trans, root, l, right, 0, nritems); - BUG_ON(ret); - - if (mid <= slot) { - btrfs_tree_unlock(path->nodes[0]); - free_extent_buffer(path->nodes[0]); - path->nodes[0] = right; - path->slots[0] -= mid; - path->slots[1] += 1; - } else { - btrfs_tree_unlock(right); - free_extent_buffer(right); - } - - BUG_ON(path->slots[0] < 0); - - return ret; -} - /* * split the path's leaf in two, making sure there is at least data_size * available for the resulting leaf level of the path. 
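/* Editorial sketch, not patch content: the offset rebasing done when a
 * leaf splits, mirroring the rt_data_off computation in the deleted
 * copy_for_split above. Item data grows down from the end of a leaf, so
 * items moved into the new right leaf keep their relative layout but
 * every stored offset is shifted so the first moved item's data ends
 * flush with the end of the new leaf. Plain C model with made-up sizes. */
#include <stdint.h>
#include <stdio.h>

#define LEAF_DATA_SIZE 4096u

int main(void)
{
	/* three items being moved to the right leaf, highest data first */
	uint32_t off[3]  = { 3700, 3500, 3400 };
	uint32_t size[3] = {  300,  200,  100 };

	/* leaf size minus the end of the first moved item's data */
	uint32_t rt_data_off = LEAF_DATA_SIZE - (off[0] + size[0]); /* 96 */

	for (int i = 0; i < 3; i++)
		printf("item %d: offset %u -> %u (size %u)\n",
		       i, off[i], off[i] + rt_data_off, size[i]);
	/* item 0 now ends at 3796 + 300 = 4096: flush with the leaf end */
	return 0;
}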
@@ -2846,14 +2771,17 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, int mid; int slot; struct extent_buffer *right; + int data_copy_size; + int rt_data_off; + int i; int ret = 0; int wret; int double_split; int num_doubles = 0; + struct btrfs_disk_key disk_key; /* first try to make some room by pushing left and right */ - if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY && - !trans->transaction->delayed_refs.flushing) { + if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) { wret = push_leaf_right(trans, root, path, data_size, 0); if (wret < 0) return wret; @@ -2902,14 +2830,11 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, write_extent_buffer(right, root->fs_info->chunk_tree_uuid, (unsigned long)btrfs_header_chunk_tree_uuid(right), BTRFS_UUID_SIZE); - if (mid <= slot) { if (nritems == 1 || leaf_space_used(l, mid, nritems - mid) + data_size > BTRFS_LEAF_DATA_SIZE(root)) { if (slot >= nritems) { - struct btrfs_disk_key disk_key; - btrfs_cpu_key_to_disk(&disk_key, ins_key); btrfs_set_header_nritems(right, 0); wret = insert_ptr(trans, root, path, @@ -2937,8 +2862,6 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, if (leaf_space_used(l, 0, mid) + data_size > BTRFS_LEAF_DATA_SIZE(root)) { if (!extend && data_size && slot == 0) { - struct btrfs_disk_key disk_key; - btrfs_cpu_key_to_disk(&disk_key, ins_key); btrfs_set_header_nritems(right, 0); wret = insert_ptr(trans, root, path, @@ -2971,16 +2894,76 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, } } } + nritems = nritems - mid; + btrfs_set_header_nritems(right, nritems); + data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l); + + copy_extent_buffer(right, l, btrfs_item_nr_offset(0), + btrfs_item_nr_offset(mid), + nritems * sizeof(struct btrfs_item)); + + copy_extent_buffer(right, l, + btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) - + data_copy_size, btrfs_leaf_data(l) + + leaf_data_end(root, l), data_copy_size); + + rt_data_off = BTRFS_LEAF_DATA_SIZE(root) - + btrfs_item_end_nr(l, mid); + + for (i = 0; i < nritems; i++) { + struct btrfs_item *item = btrfs_item_nr(right, i); + u32 ioff; + + if (!right->map_token) { + map_extent_buffer(right, (unsigned long)item, + sizeof(struct btrfs_item), + &right->map_token, &right->kaddr, + &right->map_start, &right->map_len, + KM_USER1); + } + + ioff = btrfs_item_offset(right, item); + btrfs_set_item_offset(right, item, ioff + rt_data_off); + } + + if (right->map_token) { + unmap_extent_buffer(right, right->map_token, KM_USER1); + right->map_token = NULL; + } + + btrfs_set_header_nritems(l, mid); + ret = 0; + btrfs_item_key(right, &disk_key, 0); + wret = insert_ptr(trans, root, path, &disk_key, right->start, + path->slots[1] + 1, 1); + if (wret) + ret = wret; + + btrfs_mark_buffer_dirty(right); + btrfs_mark_buffer_dirty(l); + BUG_ON(path->slots[0] != slot); - ret = copy_for_split(trans, root, path, l, right, slot, mid, nritems); + ret = btrfs_update_ref(trans, root, l, right, 0, nritems); BUG_ON(ret); + if (mid <= slot) { + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = right; + path->slots[0] -= mid; + path->slots[1] += 1; + } else { + btrfs_tree_unlock(right); + free_extent_buffer(right); + } + + BUG_ON(path->slots[0] < 0); + if (double_split) { BUG_ON(num_doubles != 0); num_doubles++; goto again; } - return ret; } @@ -3038,27 +3021,26 @@ int btrfs_split_item(struct btrfs_trans_handle *trans, return -EAGAIN; } - btrfs_set_path_blocking(path); ret = 
split_leaf(trans, root, &orig_key, path, sizeof(struct btrfs_item), 1); path->keep_locks = 0; BUG_ON(ret); - btrfs_unlock_up_safe(path, 1); - leaf = path->nodes[0]; - BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item)); - -split: /* * make sure any changes to the path from split_leaf leave it * in a blocking state */ btrfs_set_path_blocking(path); + leaf = path->nodes[0]; + BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item)); + +split: item = btrfs_item_nr(leaf, path->slots[0]); orig_offset = btrfs_item_offset(leaf, item); item_size = btrfs_item_size(leaf, item); + buf = kmalloc(item_size, GFP_NOFS); read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, path->slots[0]), item_size); @@ -3463,27 +3445,39 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans, } /* - * this is a helper for btrfs_insert_empty_items, the main goal here is - * to save stack depth by doing the bulk of the work in a function - * that doesn't call btrfs_search_slot + * Given a key and some data, insert items into the tree. + * This does all the path init required, making room in the tree if needed. */ -static noinline_for_stack int -setup_items_for_insert(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_key *cpu_key, u32 *data_size, - u32 total_data, u32 total_size, int nr) +int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *cpu_key, u32 *data_size, + int nr) { + struct extent_buffer *leaf; struct btrfs_item *item; + int ret = 0; + int slot; + int slot_orig; int i; u32 nritems; + u32 total_size = 0; + u32 total_data = 0; unsigned int data_end; struct btrfs_disk_key disk_key; - int ret; - struct extent_buffer *leaf; - int slot; + for (i = 0; i < nr; i++) + total_data += data_size[i]; + + total_size = total_data + (nr * sizeof(struct btrfs_item)); + ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1); + if (ret == 0) + return -EEXIST; + if (ret < 0) + goto out; + + slot_orig = path->slots[0]; leaf = path->nodes[0]; - slot = path->slots[0]; nritems = btrfs_header_nritems(leaf); data_end = leaf_data_end(root, leaf); @@ -3495,6 +3489,9 @@ setup_items_for_insert(struct btrfs_trans_handle *trans, BUG(); } + slot = path->slots[0]; + BUG_ON(slot < 0); + if (slot != nritems) { unsigned int old_data = btrfs_item_end_nr(leaf, slot); @@ -3550,60 +3547,21 @@ setup_items_for_insert(struct btrfs_trans_handle *trans, data_end -= data_size[i]; btrfs_set_item_size(leaf, item, data_size[i]); } - btrfs_set_header_nritems(leaf, nritems + nr); + btrfs_mark_buffer_dirty(leaf); ret = 0; if (slot == 0) { - struct btrfs_disk_key disk_key; btrfs_cpu_key_to_disk(&disk_key, cpu_key); ret = fixup_low_keys(trans, root, path, &disk_key, 1); } - btrfs_unlock_up_safe(path, 1); - btrfs_mark_buffer_dirty(leaf); if (btrfs_leaf_free_space(root, leaf) < 0) { btrfs_print_leaf(root, leaf); BUG(); } - return ret; -} - -/* - * Given a key and some data, insert items into the tree. - * This does all the path init required, making room in the tree if needed. 
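/* Editorial model, not patch content: the sizing math used by the
 * restored btrfs_insert_empty_items above. A batch insert must reserve
 * room for every item's data plus one struct btrfs_item header per item
 * (a 17-byte disk key plus two u32 fields, 25 bytes on disk), and that
 * total is what btrfs_search_slot is asked to guarantee. */
#include <stdint.h>
#include <stdio.h>

#define ITEM_HEADER_SIZE 25u	/* sizeof(struct btrfs_item) on disk */

int main(void)
{
	uint32_t data_size[3] = { 16, 100, 8 };
	uint32_t total_data = 0, total_size;

	for (int i = 0; i < 3; i++)
		total_data += data_size[i];
	total_size = total_data + 3 * ITEM_HEADER_SIZE;
	printf("need %u bytes of leaf space\n", total_size); /* 199 */
	return 0;
}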
- */ -int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_key *cpu_key, u32 *data_size, - int nr) -{ - struct extent_buffer *leaf; - int ret = 0; - int slot; - int i; - u32 total_size = 0; - u32 total_data = 0; - - for (i = 0; i < nr; i++) - total_data += data_size[i]; - - total_size = total_data + (nr * sizeof(struct btrfs_item)); - ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1); - if (ret == 0) - return -EEXIST; - if (ret < 0) - goto out; - - leaf = path->nodes[0]; - slot = path->slots[0]; - BUG_ON(slot < 0); - - ret = setup_items_for_insert(trans, root, path, cpu_key, data_size, - total_data, total_size, nr); - out: + btrfs_unlock_up_safe(path, 1); return ret; } @@ -3791,8 +3749,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, } /* delete the leaf if it is mostly empty */ - if (used < BTRFS_LEAF_DATA_SIZE(root) / 4 && - !trans->transaction->delayed_refs.flushing) { + if (used < BTRFS_LEAF_DATA_SIZE(root) / 4) { /* push_leaf_left fixes the path. * make sure the path still points to our leaf * for possible call to del_ptr below @@ -3800,7 +3757,6 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, slot = path->slots[1]; extent_buffer_get(leaf); - btrfs_set_path_blocking(path); wret = push_leaf_left(trans, root, path, 1, 1); if (wret < 0 && wret != -ENOSPC) ret = wret; diff --git a/trunk/fs/btrfs/ctree.h b/trunk/fs/btrfs/ctree.h index 9417713542a2..7dd1b6d0bf32 100644 --- a/trunk/fs/btrfs/ctree.h +++ b/trunk/fs/btrfs/ctree.h @@ -45,13 +45,6 @@ struct btrfs_ordered_sum; #define BTRFS_MAX_LEVEL 8 -/* - * files bigger than this get some pre-flushing when they are added - * to the ordered operations list. That way we limit the total - * work done by the commit - */ -#define BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT (8 * 1024 * 1024) - /* holds pointers to all of the tree roots */ #define BTRFS_ROOT_TREE_OBJECTID 1ULL @@ -408,16 +401,15 @@ struct btrfs_path { int locks[BTRFS_MAX_LEVEL]; int reada; /* keep some upper locks as we walk down */ + int keep_locks; + int skip_locking; int lowest_level; /* * set by btrfs_split_item, tells search_slot to keep all locks * and to force calls to keep space in the nodes */ - unsigned int search_for_split:1; - unsigned int keep_locks:1; - unsigned int skip_locking:1; - unsigned int leave_spinning:1; + int search_for_split; }; /* @@ -696,18 +688,15 @@ struct btrfs_fs_info { struct rb_root block_group_cache_tree; struct extent_io_tree pinned_extents; + struct extent_io_tree pending_del; + struct extent_io_tree extent_ins; /* logical->physical extent mapping */ struct btrfs_mapping_tree mapping_tree; u64 generation; u64 last_trans_committed; - - /* - * this is updated to the current trans every time a full commit - * is required instead of the faster short fsync log commits - */ - u64 last_trans_log_full_commit; + u64 last_trans_new_blockgroup; u64 open_ioctl_trans; unsigned long mount_opt; u64 max_extent; @@ -728,21 +717,12 @@ struct btrfs_fs_info { struct mutex tree_log_mutex; struct mutex transaction_kthread_mutex; struct mutex cleaner_mutex; + struct mutex extent_ins_mutex; struct mutex pinned_mutex; struct mutex chunk_mutex; struct mutex drop_mutex; struct mutex volume_mutex; struct mutex tree_reloc_mutex; - - /* - * this protects the ordered operations list only while we are - * processing all of the entries on it. 
This way we make - * sure the commit code doesn't find the list temporarily empty - * because another function happens to be doing non-waiting preflush - * before jumping into the main commit. - */ - struct mutex ordered_operations_mutex; - struct list_head trans_list; struct list_head hashers; struct list_head dead_roots; @@ -757,28 +737,9 @@ struct btrfs_fs_info { * ordered extents */ spinlock_t ordered_extent_lock; - - /* - * all of the data=ordered extents pending writeback - * these can span multiple transactions and basically include - * every dirty data page that isn't from nodatacow - */ struct list_head ordered_extents; - - /* - * all of the inodes that have delalloc bytes. It is possible for - * this list to be empty even when there is still dirty data=ordered - * extents waiting to finish IO. - */ struct list_head delalloc_inodes; - /* - * special rename and truncate targets that must be on disk before - * we're allowed to commit. This is basically the ext3 style - * data=ordered list. - */ - struct list_head ordered_operations; - /* * there is a pool of worker threads for checksumming during writes * and a pool for checksumming after reads. This is because readers @@ -820,11 +781,6 @@ struct btrfs_fs_info { atomic_t throttle_gen; u64 total_pinned; - - /* protected by the delalloc lock, used to keep from writing - * metadata until there is a nice batch - */ - u64 dirty_metadata_bytes; struct list_head dirty_cowonly_roots; struct btrfs_fs_devices *fs_devices; @@ -1748,15 +1704,18 @@ static inline struct dentry *fdentry(struct file *file) } /* extent-tree.c */ -int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, - struct btrfs_root *root, unsigned long count); int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len); +int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u32 *refs); int btrfs_update_pinned_extents(struct btrfs_root *root, u64 bytenr, u64 num, int pin); int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *leaf); int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid, u64 bytenr); +int btrfs_extent_post_op(struct btrfs_trans_handle *trans, + struct btrfs_root *root); int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy); struct btrfs_block_group_cache *btrfs_lookup_block_group( struct btrfs_fs_info *info, @@ -1818,7 +1777,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, u64 root_objectid, u64 ref_generation, u64 owner_objectid); int btrfs_update_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr, u64 num_bytes, + struct btrfs_root *root, u64 bytenr, u64 orig_parent, u64 parent, u64 root_objectid, u64 ref_generation, u64 owner_objectid); @@ -1879,7 +1838,7 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, - struct extent_buffer **cow_ret); + struct extent_buffer **cow_ret, u64 prealloc_dest); int btrfs_copy_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, diff --git a/trunk/fs/btrfs/delayed-ref.c b/trunk/fs/btrfs/delayed-ref.c deleted file mode 100644 index cbf7dc8ae3ec..000000000000 --- a/trunk/fs/btrfs/delayed-ref.c +++ /dev/null @@ -1,669 +0,0 @@ -/* - * Copyright (C) 2009 Oracle. 
All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include -#include -#include -#include "ctree.h" -#include "delayed-ref.h" -#include "transaction.h" - -/* - * delayed back reference update tracking. For subvolume trees - * we queue up extent allocations and backref maintenance for - * delayed processing. This avoids deep call chains where we - * add extents in the middle of btrfs_search_slot, and it allows - * us to buffer up frequently modified backrefs in an rb tree instead - * of hammering updates on the extent allocation tree. - * - * Right now this code is only used for reference counted trees, but - * the long term goal is to get rid of the similar code for delayed - * extent tree modifications. - */ - -/* - * entries in the rb tree are ordered by the byte number of the extent - * and by the byte number of the parent block. - */ -static int comp_entry(struct btrfs_delayed_ref_node *ref, - u64 bytenr, u64 parent) -{ - if (bytenr < ref->bytenr) - return -1; - if (bytenr > ref->bytenr) - return 1; - if (parent < ref->parent) - return -1; - if (parent > ref->parent) - return 1; - return 0; -} - -/* - * insert a new ref into the rbtree. This returns any existing refs - * for the same (bytenr,parent) tuple, or NULL if the new node was properly - * inserted. - */ -static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root, - u64 bytenr, u64 parent, - struct rb_node *node) -{ - struct rb_node **p = &root->rb_node; - struct rb_node *parent_node = NULL; - struct btrfs_delayed_ref_node *entry; - int cmp; - - while (*p) { - parent_node = *p; - entry = rb_entry(parent_node, struct btrfs_delayed_ref_node, - rb_node); - - cmp = comp_entry(entry, bytenr, parent); - if (cmp < 0) - p = &(*p)->rb_left; - else if (cmp > 0) - p = &(*p)->rb_right; - else - return entry; - } - - entry = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - rb_link_node(node, parent_node, p); - rb_insert_color(node, root); - return NULL; -} - -/* - * find an entry based on (bytenr,parent). 
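/* Editorial aside, not patch content: the (bytenr, parent) ordering that
 * comp_entry above imposes on the rbtree, as a plain compare function.
 * Head nodes use parent == (u64)-1, the largest possible value, so a
 * head sorts after every regular ref for the same extent. Model only. */
#include <stdint.h>
#include <stdio.h>

static int comp_model(uint64_t e_bytenr, uint64_t e_parent,
		      uint64_t bytenr, uint64_t parent)
{
	if (bytenr < e_bytenr) return -1;
	if (bytenr > e_bytenr) return 1;
	if (parent < e_parent) return -1;
	if (parent > e_parent) return 1;
	return 0;
}

int main(void)
{
	/* a regular ref on the same extent sorts before the head node,
	 * which is why searches for (bytenr, (u64)-1) land on the head */
	printf("%d\n", comp_model(4096, (uint64_t)-1, 4096, 8192)); /* -1 */
	return 0;
}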
This returns the delayed - * ref if it was able to find one, or NULL if nothing was in that spot - */ -static struct btrfs_delayed_ref_node *tree_search(struct rb_root *root, - u64 bytenr, u64 parent, - struct btrfs_delayed_ref_node **last) -{ - struct rb_node *n = root->rb_node; - struct btrfs_delayed_ref_node *entry; - int cmp; - - while (n) { - entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); - WARN_ON(!entry->in_tree); - if (last) - *last = entry; - - cmp = comp_entry(entry, bytenr, parent); - if (cmp < 0) - n = n->rb_left; - else if (cmp > 0) - n = n->rb_right; - else - return entry; - } - return NULL; -} - -int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_head *head) -{ - struct btrfs_delayed_ref_root *delayed_refs; - - delayed_refs = &trans->transaction->delayed_refs; - assert_spin_locked(&delayed_refs->lock); - if (mutex_trylock(&head->mutex)) - return 0; - - atomic_inc(&head->node.refs); - spin_unlock(&delayed_refs->lock); - - mutex_lock(&head->mutex); - spin_lock(&delayed_refs->lock); - if (!head->node.in_tree) { - mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(&head->node); - return -EAGAIN; - } - btrfs_put_delayed_ref(&head->node); - return 0; -} - -int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, - struct list_head *cluster, u64 start) -{ - int count = 0; - struct btrfs_delayed_ref_root *delayed_refs; - struct rb_node *node; - struct btrfs_delayed_ref_node *ref; - struct btrfs_delayed_ref_head *head; - - delayed_refs = &trans->transaction->delayed_refs; - if (start == 0) { - node = rb_first(&delayed_refs->root); - } else { - ref = NULL; - tree_search(&delayed_refs->root, start, (u64)-1, &ref); - if (ref) { - struct btrfs_delayed_ref_node *tmp; - - node = rb_prev(&ref->rb_node); - while (node) { - tmp = rb_entry(node, - struct btrfs_delayed_ref_node, - rb_node); - if (tmp->bytenr < start) - break; - ref = tmp; - node = rb_prev(&ref->rb_node); - } - node = &ref->rb_node; - } else - node = rb_first(&delayed_refs->root); - } -again: - while (node && count < 32) { - ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - if (btrfs_delayed_ref_is_head(ref)) { - head = btrfs_delayed_node_to_head(ref); - if (list_empty(&head->cluster)) { - list_add_tail(&head->cluster, cluster); - delayed_refs->run_delayed_start = - head->node.bytenr; - count++; - - WARN_ON(delayed_refs->num_heads_ready == 0); - delayed_refs->num_heads_ready--; - } else if (count) { - /* the goal of the clustering is to find extents - * that are likely to end up in the same extent - * leaf on disk. So, we don't want them spread - * all over the tree. Stop now if we've hit - * a head that was already in use - */ - break; - } - } - node = rb_next(node); - } - if (count) { - return 0; - } else if (start) { - /* - * we've gone to the end of the rbtree without finding any - * clusters. start from the beginning and try again - */ - start = 0; - node = rb_first(&delayed_refs->root); - goto again; - } - return 1; -} - -/* - * This checks to see if there are any delayed refs in the - * btree for a given bytenr. It returns one if it finds any - * and zero otherwise. - * - * If it only finds a head node, it returns 0. - * - * The idea is to use this when deciding if you can safely delete an - * extent from the extent allocation tree. There may be a pending - * ref in the rbtree that adds or removes references, so as long as this - * returns one you need to leave the BTRFS_EXTENT_ITEM in the extent - * allocation tree. 
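/* Editorial sketch, not patch content: the lock-ordering dance used by
 * btrfs_delayed_ref_lock above, modeled with pthreads. You cannot sleep
 * on the head mutex while holding the tree spinlock, so on contention
 * the code pins the node, drops the spinlock, sleeps on the mutex,
 * retakes the spinlock, and revalidates that the node is still in the
 * tree, returning -EAGAIN if it raced away. Model only. */
#include <errno.h>
#include <pthread.h>

struct head { pthread_mutex_t mutex; int in_tree; int refs; };

static int lock_head(pthread_spinlock_t *tree_lock, struct head *h)
{
	/* caller holds *tree_lock */
	if (pthread_mutex_trylock(&h->mutex) == 0)
		return 0;		/* fast path: no contention */

	h->refs++;			/* pin: keep h alive across sleep */
	pthread_spin_unlock(tree_lock);
	pthread_mutex_lock(&h->mutex);	/* may sleep */
	pthread_spin_lock(tree_lock);

	if (!h->in_tree) {		/* raced: processed and removed */
		pthread_mutex_unlock(&h->mutex);
		h->refs--;
		return -EAGAIN;
	}
	h->refs--;
	return 0;
}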
- */ -int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr) -{ - struct btrfs_delayed_ref_node *ref; - struct btrfs_delayed_ref_root *delayed_refs; - struct rb_node *prev_node; - int ret = 0; - - delayed_refs = &trans->transaction->delayed_refs; - spin_lock(&delayed_refs->lock); - - ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL); - if (ref) { - prev_node = rb_prev(&ref->rb_node); - if (!prev_node) - goto out; - ref = rb_entry(prev_node, struct btrfs_delayed_ref_node, - rb_node); - if (ref->bytenr == bytenr) - ret = 1; - } -out: - spin_unlock(&delayed_refs->lock); - return ret; -} - -/* - * helper function to lookup reference count - * - * the head node for delayed ref is used to store the sum of all the - * reference count modifications queued up in the rbtree. This way you - * can check to see what the reference count would be if all of the - * delayed refs are processed. - */ -int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr, - u64 num_bytes, u32 *refs) -{ - struct btrfs_delayed_ref_node *ref; - struct btrfs_delayed_ref_head *head; - struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_path *path; - struct extent_buffer *leaf; - struct btrfs_extent_item *ei; - struct btrfs_key key; - u32 num_refs; - int ret; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - key.objectid = bytenr; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = num_bytes; - delayed_refs = &trans->transaction->delayed_refs; -again: - ret = btrfs_search_slot(trans, root->fs_info->extent_root, - &key, path, 0, 0); - if (ret < 0) - goto out; - - if (ret == 0) { - leaf = path->nodes[0]; - ei = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_item); - num_refs = btrfs_extent_refs(leaf, ei); - } else { - num_refs = 0; - ret = 0; - } - - spin_lock(&delayed_refs->lock); - ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL); - if (ref) { - head = btrfs_delayed_node_to_head(ref); - if (mutex_trylock(&head->mutex)) { - num_refs += ref->ref_mod; - mutex_unlock(&head->mutex); - *refs = num_refs; - goto out; - } - - atomic_inc(&ref->refs); - spin_unlock(&delayed_refs->lock); - - btrfs_release_path(root->fs_info->extent_root, path); - - mutex_lock(&head->mutex); - mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(ref); - goto again; - } else { - *refs = num_refs; - } -out: - spin_unlock(&delayed_refs->lock); - btrfs_free_path(path); - return ret; -} - -/* - * helper function to update an extent delayed ref in the - * rbtree. existing and update must both have the same - * bytenr and parent - * - * This may free existing if the update cancels out whatever - * operation it was doing. - */ -static noinline void -update_existing_ref(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_delayed_ref_node *existing, - struct btrfs_delayed_ref_node *update) -{ - struct btrfs_delayed_ref *existing_ref; - struct btrfs_delayed_ref *ref; - - existing_ref = btrfs_delayed_node_to_ref(existing); - ref = btrfs_delayed_node_to_ref(update); - - if (ref->pin) - existing_ref->pin = 1; - - if (ref->action != existing_ref->action) { - /* - * this is effectively undoing either an add or a - * drop. We decrement the ref_mod, and if it goes - * down to zero we just delete the entry without - * every changing the extent allocation tree. 
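/* Editorial model, not patch content: the in-memory accounting the two
 * functions above rely on. The effective reference count is whatever is
 * recorded on disk plus the running ref_mod sum kept in the head node
 * (btrfs_lookup_extent_ref), and an add queued against a pending drop
 * simply cancels in memory (update_existing_ref), so the extent tree is
 * never touched for short-lived changes. */
#include <stdio.h>

int main(void)
{
	unsigned int on_disk_refs = 2;	/* refs recorded in the extent tree */
	int head_ref_mod = 0;		/* running sum in the head node */

	head_ref_mod += 1;		/* queued ADD */
	head_ref_mod -= 1;		/* queued DROP cancels it in memory */
	head_ref_mod -= 1;		/* another DROP */

	printf("effective refs = %u\n", on_disk_refs + head_ref_mod); /* 1 */
	return 0;
}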
- */ - existing->ref_mod--; - if (existing->ref_mod == 0) { - rb_erase(&existing->rb_node, - &delayed_refs->root); - existing->in_tree = 0; - btrfs_put_delayed_ref(existing); - delayed_refs->num_entries--; - if (trans->delayed_ref_updates) - trans->delayed_ref_updates--; - } - } else { - if (existing_ref->action == BTRFS_ADD_DELAYED_REF) { - /* if we're adding refs, make sure all the - * details match up. The extent could - * have been totally freed and reallocated - * by a different owner before the delayed - * ref entries were removed. - */ - existing_ref->owner_objectid = ref->owner_objectid; - existing_ref->generation = ref->generation; - existing_ref->root = ref->root; - existing->num_bytes = update->num_bytes; - } - /* - * the action on the existing ref matches - * the action on the ref we're trying to add. - * Bump the ref_mod by one so the backref that - * is eventually added/removed has the correct - * reference count - */ - existing->ref_mod += update->ref_mod; - } -} - -/* - * helper function to update the accounting in the head ref - * existing and update must have the same bytenr - */ -static noinline void -update_existing_head_ref(struct btrfs_delayed_ref_node *existing, - struct btrfs_delayed_ref_node *update) -{ - struct btrfs_delayed_ref_head *existing_ref; - struct btrfs_delayed_ref_head *ref; - - existing_ref = btrfs_delayed_node_to_head(existing); - ref = btrfs_delayed_node_to_head(update); - - if (ref->must_insert_reserved) { - /* if the extent was freed and then - * reallocated before the delayed ref - * entries were processed, we can end up - * with an existing head ref without - * the must_insert_reserved flag set. - * Set it again here - */ - existing_ref->must_insert_reserved = ref->must_insert_reserved; - - /* - * update the num_bytes so we make sure the accounting - * is done correctly - */ - existing->num_bytes = update->num_bytes; - - } - - /* - * update the reference mod on the head to reflect this new operation - */ - existing->ref_mod += update->ref_mod; -} - -/* - * helper function to actually insert a delayed ref into the rbtree. - * this does all the dirty work in terms of maintaining the correct - * overall modification count in the head node and properly dealing - * with updating existing nodes as new modifications are queued. - */ -static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *ref, - u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, - u64 ref_generation, u64 owner_objectid, int action, - int pin) -{ - struct btrfs_delayed_ref_node *existing; - struct btrfs_delayed_ref *full_ref; - struct btrfs_delayed_ref_head *head_ref = NULL; - struct btrfs_delayed_ref_root *delayed_refs; - int count_mod = 1; - int must_insert_reserved = 0; - - /* - * the head node stores the sum of all the mods, so dropping a ref - * should drop the sum in the head node by one. - */ - if (parent == (u64)-1) { - if (action == BTRFS_DROP_DELAYED_REF) - count_mod = -1; - else if (action == BTRFS_UPDATE_DELAYED_HEAD) - count_mod = 0; - } - - /* - * BTRFS_ADD_DELAYED_EXTENT means that we need to update - * the reserved accounting when the extent is finally added, or - * if a later modification deletes the delayed ref without ever - * inserting the extent into the extent allocation tree. - * ref->must_insert_reserved is the flag used to record - * that accounting mods are required. - * - * Once we record must_insert_reserved, switch the action to - * BTRFS_ADD_DELAYED_REF because other special casing is not required. 
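/* Editorial sketch, not patch content: the action normalization that
 * __btrfs_add_delayed_ref above performs. BTRFS_ADD_DELAYED_EXTENT is
 * only a flagged variant of ADD, and on head nodes (parent == (u64)-1)
 * the action translates into a ref_mod delta. Constants mirror the
 * values in this patch. */
#define ADD_REF		1
#define DROP_REF	2
#define ADD_EXTENT	3
#define UPDATE_HEAD	4

static void classify(int *action, int is_head,
		     int *count_mod, int *must_insert_reserved)
{
	*count_mod = 1;
	if (is_head) {
		if (*action == DROP_REF)
			*count_mod = -1;	/* a drop lowers the head's sum */
		else if (*action == UPDATE_HEAD)
			*count_mod = 0;		/* no refcount change at all */
	}
	*must_insert_reserved = (*action == ADD_EXTENT);
	if (*action == ADD_EXTENT)
		*action = ADD_REF;	/* no other special casing needed */
}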
- */ - if (action == BTRFS_ADD_DELAYED_EXTENT) { - must_insert_reserved = 1; - action = BTRFS_ADD_DELAYED_REF; - } else { - must_insert_reserved = 0; - } - - - delayed_refs = &trans->transaction->delayed_refs; - - /* first set the basic ref node struct up */ - atomic_set(&ref->refs, 1); - ref->bytenr = bytenr; - ref->parent = parent; - ref->ref_mod = count_mod; - ref->in_tree = 1; - ref->num_bytes = num_bytes; - - if (btrfs_delayed_ref_is_head(ref)) { - head_ref = btrfs_delayed_node_to_head(ref); - head_ref->must_insert_reserved = must_insert_reserved; - INIT_LIST_HEAD(&head_ref->cluster); - mutex_init(&head_ref->mutex); - } else { - full_ref = btrfs_delayed_node_to_ref(ref); - full_ref->root = ref_root; - full_ref->generation = ref_generation; - full_ref->owner_objectid = owner_objectid; - full_ref->pin = pin; - full_ref->action = action; - } - - existing = tree_insert(&delayed_refs->root, bytenr, - parent, &ref->rb_node); - - if (existing) { - if (btrfs_delayed_ref_is_head(ref)) - update_existing_head_ref(existing, ref); - else - update_existing_ref(trans, delayed_refs, existing, ref); - - /* - * we've updated the existing ref, free the newly - * allocated ref - */ - kfree(ref); - } else { - if (btrfs_delayed_ref_is_head(ref)) { - delayed_refs->num_heads++; - delayed_refs->num_heads_ready++; - } - delayed_refs->num_entries++; - trans->delayed_ref_updates++; - } - return 0; -} - -/* - * add a delayed ref to the tree. This does all of the accounting required - * to make sure the delayed ref is eventually processed before this - * transaction commits. - */ -int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, - u64 ref_generation, u64 owner_objectid, int action, - int pin) -{ - struct btrfs_delayed_ref *ref; - struct btrfs_delayed_ref_head *head_ref; - struct btrfs_delayed_ref_root *delayed_refs; - int ret; - - ref = kmalloc(sizeof(*ref), GFP_NOFS); - if (!ref) - return -ENOMEM; - - /* - * the parent = 0 case comes from cases where we don't actually - * know the parent yet. It will get updated later via a add/drop - * pair. - */ - if (parent == 0) - parent = bytenr; - - head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); - if (!head_ref) { - kfree(ref); - return -ENOMEM; - } - delayed_refs = &trans->transaction->delayed_refs; - spin_lock(&delayed_refs->lock); - - /* - * insert both the head node and the new ref without dropping - * the spin lock - */ - ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes, - (u64)-1, 0, 0, 0, action, pin); - BUG_ON(ret); - - ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes, - parent, ref_root, ref_generation, - owner_objectid, action, pin); - BUG_ON(ret); - spin_unlock(&delayed_refs->lock); - return 0; -} - -/* - * this does a simple search for the head node for a given extent. - * It must be called with the delayed ref spinlock held, and it returns - * the head node if any where found, or NULL if not. - */ -struct btrfs_delayed_ref_head * -btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr) -{ - struct btrfs_delayed_ref_node *ref; - struct btrfs_delayed_ref_root *delayed_refs; - - delayed_refs = &trans->transaction->delayed_refs; - ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL); - if (ref) - return btrfs_delayed_node_to_head(ref); - return NULL; -} - -/* - * add a delayed ref to the tree. This does all of the accounting required - * to make sure the delayed ref is eventually processed before this - * transaction commits. 
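/* Editorial aside, not patch content: the allocation discipline visible
 * in btrfs_add_delayed_ref above and btrfs_update_delayed_ref below.
 * Every node is allocated up front with GFP_NOFS, so the spinlock is
 * held only for the rbtree insertions, never across an allocation that
 * could recurse into the filesystem. Plain userspace model; in the real
 * code ownership of the nodes passes to the rbtree. */
#include <stdlib.h>
#include <pthread.h>

struct node { long bytenr; };

static int add_pair(pthread_spinlock_t *lock, long bytenr)
{
	struct node *head = malloc(sizeof(*head));	/* kmalloc(GFP_NOFS) */
	struct node *ref  = malloc(sizeof(*ref));

	if (!head || !ref) {
		free(head);
		free(ref);
		return -1;	/* -ENOMEM reported before any locking */
	}
	pthread_spin_lock(lock);
	head->bytenr = bytenr;	/* stand-in for the two tree_insert calls */
	ref->bytenr  = bytenr;
	pthread_spin_unlock(lock);
	return 0;
}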
- * - * The main point of this call is to add and remove a backreference in a single - * shot, taking the lock only once, and only searching for the head node once. - * - * It is the same as doing a ref add and delete in two separate calls. - */ -int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes, u64 orig_parent, - u64 parent, u64 orig_ref_root, u64 ref_root, - u64 orig_ref_generation, u64 ref_generation, - u64 owner_objectid, int pin) -{ - struct btrfs_delayed_ref *ref; - struct btrfs_delayed_ref *old_ref; - struct btrfs_delayed_ref_head *head_ref; - struct btrfs_delayed_ref_root *delayed_refs; - int ret; - - ref = kmalloc(sizeof(*ref), GFP_NOFS); - if (!ref) - return -ENOMEM; - - old_ref = kmalloc(sizeof(*old_ref), GFP_NOFS); - if (!old_ref) { - kfree(ref); - return -ENOMEM; - } - - /* - * the parent = 0 case comes from cases where we don't actually - * know the parent yet. It will get updated later via a add/drop - * pair. - */ - if (parent == 0) - parent = bytenr; - if (orig_parent == 0) - orig_parent = bytenr; - - head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); - if (!head_ref) { - kfree(ref); - kfree(old_ref); - return -ENOMEM; - } - delayed_refs = &trans->transaction->delayed_refs; - spin_lock(&delayed_refs->lock); - - /* - * insert both the head node and the new ref without dropping - * the spin lock - */ - ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes, - (u64)-1, 0, 0, 0, - BTRFS_UPDATE_DELAYED_HEAD, 0); - BUG_ON(ret); - - ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes, - parent, ref_root, ref_generation, - owner_objectid, BTRFS_ADD_DELAYED_REF, 0); - BUG_ON(ret); - - ret = __btrfs_add_delayed_ref(trans, &old_ref->node, bytenr, num_bytes, - orig_parent, orig_ref_root, - orig_ref_generation, owner_objectid, - BTRFS_DROP_DELAYED_REF, pin); - BUG_ON(ret); - spin_unlock(&delayed_refs->lock); - return 0; -} diff --git a/trunk/fs/btrfs/delayed-ref.h b/trunk/fs/btrfs/delayed-ref.h deleted file mode 100644 index 3bec2ff0b15c..000000000000 --- a/trunk/fs/btrfs/delayed-ref.h +++ /dev/null @@ -1,193 +0,0 @@ -/* - * Copyright (C) 2008 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. 
- */ -#ifndef __DELAYED_REF__ -#define __DELAYED_REF__ - -/* these are the possible values of struct btrfs_delayed_ref->action */ -#define BTRFS_ADD_DELAYED_REF 1 /* add one backref to the tree */ -#define BTRFS_DROP_DELAYED_REF 2 /* delete one backref from the tree */ -#define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */ -#define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */ - -struct btrfs_delayed_ref_node { - struct rb_node rb_node; - - /* the starting bytenr of the extent */ - u64 bytenr; - - /* the parent our backref will point to */ - u64 parent; - - /* the size of the extent */ - u64 num_bytes; - - /* ref count on this data structure */ - atomic_t refs; - - /* - * how many refs is this entry adding or deleting. For - * head refs, this may be a negative number because it is keeping - * track of the total mods done to the reference count. - * For individual refs, this will always be a positive number - * - * It may be more than one, since it is possible for a single - * parent to have more than one ref on an extent - */ - int ref_mod; - - /* is this node still in the rbtree? */ - unsigned int in_tree:1; -}; - -/* - * the head refs are used to hold a lock on a given extent, which allows us - * to make sure that only one process is running the delayed refs - * at a time for a single extent. They also store the sum of all the - * reference count modifications we've queued up. - */ -struct btrfs_delayed_ref_head { - struct btrfs_delayed_ref_node node; - - /* - * the mutex is held while running the refs, and it is also - * held when checking the sum of reference modifications. - */ - struct mutex mutex; - - struct list_head cluster; - - /* - * when a new extent is allocated, it is just reserved in memory - * The actual extent isn't inserted into the extent allocation tree - * until the delayed ref is processed. must_insert_reserved is - * used to flag a delayed ref so the accounting can be updated - * when a full insert is done. - * - * It is possible the extent will be freed before it is ever - * inserted into the extent allocation tree. In this case - * we need to update the in ram accounting to properly reflect - * the free has happened. 
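/* Editorial model, not patch content: the lifecycle the comment above
 * describes. A fresh allocation is only reserved in memory until its
 * delayed ref runs; if the extent is freed before that, the flag tells
 * the accounting to release the reservation instead of doing the
 * insert. Illustrative only. */
#include <stdio.h>

int main(void)
{
	int must_insert_reserved = 1;	/* extent reserved, not yet on disk */
	int freed_before_insert  = 1;	/* ref dropped before processing */

	if (must_insert_reserved && freed_before_insert)
		printf("release in-ram reservation; skip the extent tree\n");
	else if (must_insert_reserved)
		printf("do the full insert when the delayed ref runs\n");
	return 0;
}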
- */ - unsigned int must_insert_reserved:1; -}; - -struct btrfs_delayed_ref { - struct btrfs_delayed_ref_node node; - - /* the root objectid our ref will point to */ - u64 root; - - /* the generation for the backref */ - u64 generation; - - /* owner_objectid of the backref */ - u64 owner_objectid; - - /* operation done by this entry in the rbtree */ - u8 action; - - /* if pin == 1, when the extent is freed it will be pinned until - * transaction commit - */ - unsigned int pin:1; -}; - -struct btrfs_delayed_ref_root { - struct rb_root root; - - /* this spin lock protects the rbtree and the entries inside */ - spinlock_t lock; - - /* how many delayed ref updates we've queued, used by the - * throttling code - */ - unsigned long num_entries; - - /* total number of head nodes in tree */ - unsigned long num_heads; - - /* total number of head nodes ready for processing */ - unsigned long num_heads_ready; - - /* - * set when the tree is flushing before a transaction commit, - * used by the throttling code to decide if new updates need - * to be run right away - */ - int flushing; - - u64 run_delayed_start; -}; - -static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) -{ - WARN_ON(atomic_read(&ref->refs) == 0); - if (atomic_dec_and_test(&ref->refs)) { - WARN_ON(ref->in_tree); - kfree(ref); - } -} - -int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, - u64 ref_generation, u64 owner_objectid, int action, - int pin); - -struct btrfs_delayed_ref_head * -btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr); -int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr); -int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr, - u64 num_bytes, u32 *refs); -int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes, u64 orig_parent, - u64 parent, u64 orig_ref_root, u64 ref_root, - u64 orig_ref_generation, u64 ref_generation, - u64 owner_objectid, int pin); -int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_head *head); -int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, - struct list_head *cluster, u64 search_start); -/* - * a node might live in a head or a regular ref, this lets you - * test for the proper type to use. 
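/* Editorial sketch, not patch content: the embedded-node pattern the
 * helpers below rely on. Head refs and regular refs both embed the same
 * btrfs_delayed_ref_node, heads are tagged with parent == (u64)-1, and
 * container_of() recovers the outer struct. Plain C equivalent. */
#include <stddef.h>
#include <stdint.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct node { uint64_t parent; };
struct head { struct node node; int flushing; };

static int is_head(struct node *n)
{
	return n->parent == (uint64_t)-1;
}

static struct head *to_head(struct node *n)
{
	/* only valid for heads, as the WARN_ON in the real helper enforces */
	return is_head(n) ? container_of(n, struct head, node) : NULL;
}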
- */ -static int btrfs_delayed_ref_is_head(struct btrfs_delayed_ref_node *node) -{ - return node->parent == (u64)-1; -} - -/* - * helper functions to cast a node into its container - */ -static inline struct btrfs_delayed_ref * -btrfs_delayed_node_to_ref(struct btrfs_delayed_ref_node *node) -{ - WARN_ON(btrfs_delayed_ref_is_head(node)); - return container_of(node, struct btrfs_delayed_ref, node); - -} - -static inline struct btrfs_delayed_ref_head * -btrfs_delayed_node_to_head(struct btrfs_delayed_ref_node *node) -{ - WARN_ON(!btrfs_delayed_ref_is_head(node)); - return container_of(node, struct btrfs_delayed_ref_head, node); - -} -#endif diff --git a/trunk/fs/btrfs/dir-item.c b/trunk/fs/btrfs/dir-item.c index 1d70236ba00c..926a0b287a7d 100644 --- a/trunk/fs/btrfs/dir-item.c +++ b/trunk/fs/btrfs/dir-item.c @@ -145,10 +145,7 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root key.objectid = dir; btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); key.offset = btrfs_name_hash(name, name_len); - path = btrfs_alloc_path(); - path->leave_spinning = 1; - data_size = sizeof(*dir_item) + name_len; dir_item = insert_with_overflow(trans, root, path, &key, data_size, name, name_len); diff --git a/trunk/fs/btrfs/disk-io.c b/trunk/fs/btrfs/disk-io.c index 92d73929d381..6ec80c0fc869 100644 --- a/trunk/fs/btrfs/disk-io.c +++ b/trunk/fs/btrfs/disk-io.c @@ -668,31 +668,14 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, static int btree_writepage(struct page *page, struct writeback_control *wbc) { struct extent_io_tree *tree; - struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; - struct extent_buffer *eb; - int was_dirty; - tree = &BTRFS_I(page->mapping->host)->io_tree; - if (!(current->flags & PF_MEMALLOC)) { - return extent_write_full_page(tree, page, - btree_get_extent, wbc); - } - - redirty_page_for_writepage(wbc, page); - eb = btrfs_find_tree_block(root, page_offset(page), - PAGE_CACHE_SIZE); - WARN_ON(!eb); - was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); - if (!was_dirty) { - spin_lock(&root->fs_info->delalloc_lock); - root->fs_info->dirty_metadata_bytes += PAGE_CACHE_SIZE; - spin_unlock(&root->fs_info->delalloc_lock); + if (current->flags & PF_MEMALLOC) { + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; } - free_extent_buffer(eb); - - unlock_page(page); - return 0; + return extent_write_full_page(tree, page, btree_get_extent, wbc); } static int btree_writepages(struct address_space *mapping, @@ -701,15 +684,15 @@ static int btree_writepages(struct address_space *mapping, struct extent_io_tree *tree; tree = &BTRFS_I(mapping->host)->io_tree; if (wbc->sync_mode == WB_SYNC_NONE) { - struct btrfs_root *root = BTRFS_I(mapping->host)->root; u64 num_dirty; + u64 start = 0; unsigned long thresh = 32 * 1024 * 1024; if (wbc->for_kupdate) return 0; - /* this is a bit racy, but that's ok */ - num_dirty = root->fs_info->dirty_metadata_bytes; + num_dirty = count_range_bits(tree, &start, (u64)-1, + thresh, EXTENT_DIRTY); if (num_dirty < thresh) return 0; } @@ -876,17 +859,9 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, root->fs_info->running_transaction->transid) { btrfs_assert_tree_locked(buf); - if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) { - spin_lock(&root->fs_info->delalloc_lock); - if (root->fs_info->dirty_metadata_bytes >= buf->len) - root->fs_info->dirty_metadata_bytes -= buf->len; - else - WARN_ON(1); - spin_unlock(&root->fs_info->delalloc_lock); - 
} - - /* ugh, clear_extent_buffer_dirty needs to lock the page */ + /* ugh, clear_extent_buffer_dirty can be expensive */ btrfs_set_lock_blocking(buf); + clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, buf); } @@ -1496,6 +1471,12 @@ static int transaction_kthread(void *arg) vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); mutex_lock(&root->fs_info->transaction_kthread_mutex); + if (root->fs_info->total_ref_cache_size > 20 * 1024 * 1024) { + printk(KERN_INFO "btrfs: total reference cache " + "size %llu\n", + root->fs_info->total_ref_cache_size); + } + mutex_lock(&root->fs_info->trans_mutex); cur = root->fs_info->running_transaction; if (!cur) { @@ -1512,7 +1493,6 @@ static int transaction_kthread(void *arg) mutex_unlock(&root->fs_info->trans_mutex); trans = btrfs_start_transaction(root, 1); ret = btrfs_commit_transaction(trans, root); - sleep: wake_up_process(root->fs_info->cleaner_kthread); mutex_unlock(&root->fs_info->transaction_kthread_mutex); @@ -1572,7 +1552,6 @@ struct btrfs_root *open_ctree(struct super_block *sb, INIT_LIST_HEAD(&fs_info->dead_roots); INIT_LIST_HEAD(&fs_info->hashers); INIT_LIST_HEAD(&fs_info->delalloc_inodes); - INIT_LIST_HEAD(&fs_info->ordered_operations); spin_lock_init(&fs_info->delalloc_lock); spin_lock_init(&fs_info->new_trans_lock); spin_lock_init(&fs_info->ref_cache_lock); @@ -1632,6 +1611,10 @@ struct btrfs_root *open_ctree(struct super_block *sb, extent_io_tree_init(&fs_info->pinned_extents, fs_info->btree_inode->i_mapping, GFP_NOFS); + extent_io_tree_init(&fs_info->pending_del, + fs_info->btree_inode->i_mapping, GFP_NOFS); + extent_io_tree_init(&fs_info->extent_ins, + fs_info->btree_inode->i_mapping, GFP_NOFS); fs_info->do_barriers = 1; INIT_LIST_HEAD(&fs_info->dead_reloc_roots); @@ -1644,9 +1627,9 @@ struct btrfs_root *open_ctree(struct super_block *sb, insert_inode_hash(fs_info->btree_inode); mutex_init(&fs_info->trans_mutex); - mutex_init(&fs_info->ordered_operations_mutex); mutex_init(&fs_info->tree_log_mutex); mutex_init(&fs_info->drop_mutex); + mutex_init(&fs_info->extent_ins_mutex); mutex_init(&fs_info->pinned_mutex); mutex_init(&fs_info->chunk_mutex); mutex_init(&fs_info->transaction_kthread_mutex); @@ -2375,7 +2358,8 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; u64 transid = btrfs_header_generation(buf); struct inode *btree_inode = root->fs_info->btree_inode; - int was_dirty; + + btrfs_set_lock_blocking(buf); btrfs_assert_tree_locked(buf); if (transid != root->fs_info->generation) { @@ -2386,13 +2370,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) (unsigned long long)root->fs_info->generation); WARN_ON(1); } - was_dirty = set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, - buf); - if (!was_dirty) { - spin_lock(&root->fs_info->delalloc_lock); - root->fs_info->dirty_metadata_bytes += buf->len; - spin_unlock(&root->fs_info->delalloc_lock); - } + set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, buf); } void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) @@ -2432,7 +2410,6 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) int btree_lock_page_hook(struct page *page) { struct inode *inode = page->mapping->host; - struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_buffer *eb; unsigned long len; @@ -2448,16 +2425,6 @@ int btree_lock_page_hook(struct page *page) btrfs_tree_lock(eb); btrfs_set_header_flag(eb, 
BTRFS_HEADER_FLAG_WRITTEN); - - if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { - spin_lock(&root->fs_info->delalloc_lock); - if (root->fs_info->dirty_metadata_bytes >= eb->len) - root->fs_info->dirty_metadata_bytes -= eb->len; - else - WARN_ON(1); - spin_unlock(&root->fs_info->delalloc_lock); - } - btrfs_tree_unlock(eb); free_extent_buffer(eb); out: diff --git a/trunk/fs/btrfs/disk-io.h b/trunk/fs/btrfs/disk-io.h index c958ecbc1916..95029db227be 100644 --- a/trunk/fs/btrfs/disk-io.h +++ b/trunk/fs/btrfs/disk-io.h @@ -72,7 +72,6 @@ int btrfs_insert_dev_radix(struct btrfs_root *root, void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); void btrfs_mark_buffer_dirty(struct extent_buffer *buf); -void btrfs_mark_buffer_dirty_nonblocking(struct extent_buffer *buf); int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid); int btrfs_set_buffer_uptodate(struct extent_buffer *buf); int wait_on_tree_block_writeback(struct btrfs_root *root, diff --git a/trunk/fs/btrfs/extent-tree.c b/trunk/fs/btrfs/extent-tree.c index f5e7cae63d80..fefe83ad2059 100644 --- a/trunk/fs/btrfs/extent-tree.c +++ b/trunk/fs/btrfs/extent-tree.c @@ -49,23 +49,17 @@ struct pending_extent_op { int del; }; -static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 parent, - u64 root_objectid, u64 ref_generation, - u64 owner, struct btrfs_key *ins, - int ref_mod); -static int update_reserved_extents(struct btrfs_root *root, - u64 bytenr, u64 num, int reserve); +static int finish_current_insert(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, int all); +static int del_pending_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, int all); +static int pin_down_bytes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, int is_data); static int update_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, int alloc, int mark_free); -static noinline int __btrfs_free_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 ref_generation, - u64 owner_objectid, int pin, - int ref_to_drop); static int do_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 alloc_bytes, @@ -560,13 +554,262 @@ static noinline int lookup_extent_backref(struct btrfs_trans_handle *trans, return ret; } +/* + * updates all the backrefs that are pending on update_list for the + * extent_root + */ +static noinline int update_backrefs(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, + struct btrfs_path *path, + struct list_head *update_list) +{ + struct btrfs_key key; + struct btrfs_extent_ref *ref; + struct btrfs_fs_info *info = extent_root->fs_info; + struct pending_extent_op *op; + struct extent_buffer *leaf; + int ret = 0; + struct list_head *cur = update_list->next; + u64 ref_objectid; + u64 ref_root = extent_root->root_key.objectid; + + op = list_entry(cur, struct pending_extent_op, list); + +search: + key.objectid = op->bytenr; + key.type = BTRFS_EXTENT_REF_KEY; + key.offset = op->orig_parent; + + ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 1); + BUG_ON(ret); + + leaf = path->nodes[0]; + +loop: + ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); + + ref_objectid = btrfs_ref_objectid(leaf, ref); + + if 
(btrfs_ref_root(leaf, ref) != ref_root || + btrfs_ref_generation(leaf, ref) != op->orig_generation || + (ref_objectid != op->level && + ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) { + printk(KERN_ERR "btrfs couldn't find %llu, parent %llu, " + "root %llu, owner %u\n", + (unsigned long long)op->bytenr, + (unsigned long long)op->orig_parent, + (unsigned long long)ref_root, op->level); + btrfs_print_leaf(extent_root, leaf); + BUG(); + } + + key.objectid = op->bytenr; + key.offset = op->parent; + key.type = BTRFS_EXTENT_REF_KEY; + ret = btrfs_set_item_key_safe(trans, extent_root, path, &key); + BUG_ON(ret); + ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); + btrfs_set_ref_generation(leaf, ref, op->generation); + + cur = cur->next; + + list_del_init(&op->list); + unlock_extent(&info->extent_ins, op->bytenr, + op->bytenr + op->num_bytes - 1, GFP_NOFS); + kfree(op); + + if (cur == update_list) { + btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_release_path(extent_root, path); + goto out; + } + + op = list_entry(cur, struct pending_extent_op, list); + + path->slots[0]++; + while (path->slots[0] < btrfs_header_nritems(leaf)) { + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid == op->bytenr && + key.type == BTRFS_EXTENT_REF_KEY) + goto loop; + path->slots[0]++; + } + + btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_release_path(extent_root, path); + goto search; + +out: + return 0; +} + +static noinline int insert_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, + struct btrfs_path *path, + struct list_head *insert_list, int nr) +{ + struct btrfs_key *keys; + u32 *data_size; + struct pending_extent_op *op; + struct extent_buffer *leaf; + struct list_head *cur = insert_list->next; + struct btrfs_fs_info *info = extent_root->fs_info; + u64 ref_root = extent_root->root_key.objectid; + int i = 0, last = 0, ret; + int total = nr * 2; + + if (!nr) + return 0; + + keys = kzalloc(total * sizeof(struct btrfs_key), GFP_NOFS); + if (!keys) + return -ENOMEM; + + data_size = kzalloc(total * sizeof(u32), GFP_NOFS); + if (!data_size) { + kfree(keys); + return -ENOMEM; + } + + list_for_each_entry(op, insert_list, list) { + keys[i].objectid = op->bytenr; + keys[i].offset = op->num_bytes; + keys[i].type = BTRFS_EXTENT_ITEM_KEY; + data_size[i] = sizeof(struct btrfs_extent_item); + i++; + + keys[i].objectid = op->bytenr; + keys[i].offset = op->parent; + keys[i].type = BTRFS_EXTENT_REF_KEY; + data_size[i] = sizeof(struct btrfs_extent_ref); + i++; + } + + op = list_entry(cur, struct pending_extent_op, list); + i = 0; + while (i < total) { + int c; + ret = btrfs_insert_some_items(trans, extent_root, path, + keys+i, data_size+i, total-i); + BUG_ON(ret < 0); + + if (last && ret > 1) + BUG(); + + leaf = path->nodes[0]; + for (c = 0; c < ret; c++) { + int ref_first = keys[i].type == BTRFS_EXTENT_REF_KEY; + + /* + * if the first item we inserted was a backref, then + * the EXTENT_ITEM will be the odd c's, else it will + * be the even c's + */ + if ((ref_first && (c % 2)) || + (!ref_first && !(c % 2))) { + struct btrfs_extent_item *itm; + + itm = btrfs_item_ptr(leaf, path->slots[0] + c, + struct btrfs_extent_item); + btrfs_set_extent_refs(path->nodes[0], itm, 1); + op->del++; + } else { + struct btrfs_extent_ref *ref; + + ref = btrfs_item_ptr(leaf, path->slots[0] + c, + struct btrfs_extent_ref); + btrfs_set_ref_root(leaf, ref, ref_root); + btrfs_set_ref_generation(leaf, ref, + op->generation); + btrfs_set_ref_objectid(leaf, ref, op->level); + 
btrfs_set_ref_num_refs(leaf, ref, 1); + op->del++; + } + + /* + * using del to see when it's ok to free up the + * pending_extent_op. In the case where we insert the + * last item on the list in order to help do batching + * we need to not free the extent op until we actually + * insert the extent_item + */ + if (op->del == 2) { + unlock_extent(&info->extent_ins, op->bytenr, + op->bytenr + op->num_bytes - 1, + GFP_NOFS); + cur = cur->next; + list_del_init(&op->list); + kfree(op); + if (cur != insert_list) + op = list_entry(cur, + struct pending_extent_op, + list); + } + } + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(extent_root, path); + + /* + * Ok, backrefs and items usually go right next to each other, + * but if we could only insert 1 item that means that we + * inserted on the end of a leaf, and we have no idea what may + * be on the next leaf so we just play it safe. In order to + * try and help this case we insert the last thing on our + * insert list so hopefully it will end up being the last + * thing on the leaf and everything else will be before it, + * which will let us insert a whole bunch of items at the same + * time. + */ + if (ret == 1 && !last && (i + ret < total)) { + /* + * last: where we will pick up the next time around + * i: our current key to insert, will be total - 1 + * cur: the current op we are screwing with + * op: duh + */ + last = i + ret; + i = total - 1; + cur = insert_list->prev; + op = list_entry(cur, struct pending_extent_op, list); + } else if (last) { + /* + * ok we successfully inserted the last item on the + * list, let's reset everything + * + * i: our current key to insert, so where we left off + * last time + * last: done with this + * cur: the op we are messing with + * op: duh + * total: since we inserted the last key, we need to + * decrement total so we don't overflow + */ + i = last; + last = 0; + total--; + if (i < total) { + cur = insert_list->next; + op = list_entry(cur, struct pending_extent_op, + list); + } + } else { + i += ret; + } + + cond_resched(); + } + ret = 0; + kfree(keys); + kfree(data_size); + return ret; +} + static noinline int insert_extent_backref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 bytenr, u64 parent, u64 ref_root, u64 ref_generation, - u64 owner_objectid, - int refs_to_add) + u64 owner_objectid) { struct btrfs_key key; struct extent_buffer *leaf; @@ -586,10 +829,9 @@ static noinline int insert_extent_backref(struct btrfs_trans_handle *trans, btrfs_set_ref_root(leaf, ref, ref_root); btrfs_set_ref_generation(leaf, ref, ref_generation); btrfs_set_ref_objectid(leaf, ref, owner_objectid); - btrfs_set_ref_num_refs(leaf, ref, refs_to_add); + btrfs_set_ref_num_refs(leaf, ref, 1); } else if (ret == -EEXIST) { u64 existing_owner; - BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID); leaf = path->nodes[0]; ref = btrfs_item_ptr(leaf, path->slots[0], @@ -603,7 +845,7 @@ static noinline int insert_extent_backref(struct btrfs_trans_handle *trans, num_refs = btrfs_ref_num_refs(leaf, ref); BUG_ON(num_refs == 0); - btrfs_set_ref_num_refs(leaf, ref, num_refs + refs_to_add); + btrfs_set_ref_num_refs(leaf, ref, num_refs + 1); existing_owner = btrfs_ref_objectid(leaf, ref); if (existing_owner != owner_objectid && @@ -615,7 +857,6 @@ static noinline int insert_extent_backref(struct btrfs_trans_handle *trans, } else { goto out; } - btrfs_unlock_up_safe(path, 1); btrfs_mark_buffer_dirty(path->nodes[0]); out: btrfs_release_path(root, path); @@ -624,8 +865,7 @@ static noinline int
insert_extent_backref(struct btrfs_trans_handle *trans, static noinline int remove_extent_backref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_path *path, - int refs_to_drop) + struct btrfs_path *path) { struct extent_buffer *leaf; struct btrfs_extent_ref *ref; @@ -635,8 +875,8 @@ static noinline int remove_extent_backref(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); num_refs = btrfs_ref_num_refs(leaf, ref); - BUG_ON(num_refs < refs_to_drop); - num_refs -= refs_to_drop; + BUG_ON(num_refs == 0); + num_refs -= 1; if (num_refs == 0) { ret = btrfs_del_item(trans, root, path); } else { @@ -687,28 +927,332 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, #endif } +static noinline int free_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, + struct list_head *del_list) +{ + struct btrfs_fs_info *info = extent_root->fs_info; + struct btrfs_path *path; + struct btrfs_key key, found_key; + struct extent_buffer *leaf; + struct list_head *cur; + struct pending_extent_op *op; + struct btrfs_extent_item *ei; + int ret, num_to_del, extent_slot = 0, found_extent = 0; + u32 refs; + u64 bytes_freed = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = 1; + +search: + /* search for the backref for the current ref we want to delete */ + cur = del_list->next; + op = list_entry(cur, struct pending_extent_op, list); + ret = lookup_extent_backref(trans, extent_root, path, op->bytenr, + op->orig_parent, + extent_root->root_key.objectid, + op->orig_generation, op->level, 1); + if (ret) { + printk(KERN_ERR "btrfs unable to find backref byte nr %llu " + "root %llu gen %llu owner %u\n", + (unsigned long long)op->bytenr, + (unsigned long long)extent_root->root_key.objectid, + (unsigned long long)op->orig_generation, op->level); + btrfs_print_leaf(extent_root, path->nodes[0]); + WARN_ON(1); + goto out; + } + + extent_slot = path->slots[0]; + num_to_del = 1; + found_extent = 0; + + /* + * if we aren't the first item on the leaf we can move back one and see + * if our ref is right next to our extent item + */ + if (likely(extent_slot)) { + extent_slot--; + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + extent_slot); + if (found_key.objectid == op->bytenr && + found_key.type == BTRFS_EXTENT_ITEM_KEY && + found_key.offset == op->num_bytes) { + num_to_del++; + found_extent = 1; + } + } + + /* + * if we didn't find the extent we need to delete the backref and then + * search for the extent item key so we can update its ref count + */ + if (!found_extent) { + key.objectid = op->bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = op->num_bytes; + + ret = remove_extent_backref(trans, extent_root, path); + BUG_ON(ret); + btrfs_release_path(extent_root, path); + ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1); + BUG_ON(ret); + extent_slot = path->slots[0]; + } + + /* this is where we update the ref count for the extent */ + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, extent_slot, struct btrfs_extent_item); + refs = btrfs_extent_refs(leaf, ei); + BUG_ON(refs == 0); + refs--; + btrfs_set_extent_refs(leaf, ei, refs); + + btrfs_mark_buffer_dirty(leaf); + + /* + * This extent needs deleting. 
The reason cur_slot is extent_slot + + * num_to_del is because extent_slot points to the slot where the extent + * is, and if the backref was not right next to the extent we will be + * deleting at least 1 item, and will want to start searching at the + * slot directly next to extent_slot. However if we did find the + * backref next to the extent item then we will be deleting at least 2 + * items and will want to start searching directly after the ref slot + */ + if (!refs) { + struct list_head *pos, *n, *end; + int cur_slot = extent_slot+num_to_del; + u64 super_used; + u64 root_used; + + path->slots[0] = extent_slot; + bytes_freed = op->num_bytes; + + mutex_lock(&info->pinned_mutex); + ret = pin_down_bytes(trans, extent_root, op->bytenr, + op->num_bytes, op->level >= + BTRFS_FIRST_FREE_OBJECTID); + mutex_unlock(&info->pinned_mutex); + BUG_ON(ret < 0); + op->del = ret; + + /* + * we need to see if we can delete multiple things at once, so + * start looping through the list of extents we are wanting to + * delete and see if their extent/backrefs are right next to + * each other and the extents only have 1 ref + */ + for (pos = cur->next; pos != del_list; pos = pos->next) { + struct pending_extent_op *tmp; + + tmp = list_entry(pos, struct pending_extent_op, list); + + /* we only want to delete extent+ref at this stage */ + if (cur_slot >= btrfs_header_nritems(leaf) - 1) + break; + + btrfs_item_key_to_cpu(leaf, &found_key, cur_slot); + if (found_key.objectid != tmp->bytenr || + found_key.type != BTRFS_EXTENT_ITEM_KEY || + found_key.offset != tmp->num_bytes) + break; + + /* check to make sure this extent only has one ref */ + ei = btrfs_item_ptr(leaf, cur_slot, + struct btrfs_extent_item); + if (btrfs_extent_refs(leaf, ei) != 1) + break; + + btrfs_item_key_to_cpu(leaf, &found_key, cur_slot+1); + if (found_key.objectid != tmp->bytenr || + found_key.type != BTRFS_EXTENT_REF_KEY || + found_key.offset != tmp->orig_parent) + break; + + /* + * the ref is right next to the extent, we can set the + * ref count to 0 since we will delete them both now + */ + btrfs_set_extent_refs(leaf, ei, 0); + + /* pin down the bytes for this extent */ + mutex_lock(&info->pinned_mutex); + ret = pin_down_bytes(trans, extent_root, tmp->bytenr, + tmp->num_bytes, tmp->level >= + BTRFS_FIRST_FREE_OBJECTID); + mutex_unlock(&info->pinned_mutex); + BUG_ON(ret < 0); + + /* + * use the del field to tell if we need to go ahead and + * free up the extent when we delete the item or not. + */ + tmp->del = ret; + bytes_freed += tmp->num_bytes; + + num_to_del += 2; + cur_slot += 2; + } + end = pos; + + /* update the free space counters */ + spin_lock(&info->delalloc_lock); + super_used = btrfs_super_bytes_used(&info->super_copy); + btrfs_set_super_bytes_used(&info->super_copy, + super_used - bytes_freed); + + root_used = btrfs_root_used(&extent_root->root_item); + btrfs_set_root_used(&extent_root->root_item, + root_used - bytes_freed); + spin_unlock(&info->delalloc_lock); + + /* delete the items */ + ret = btrfs_del_items(trans, extent_root, path, + path->slots[0], num_to_del); + BUG_ON(ret); + + /* + * loop through the extents we deleted and do the cleanup work + * on them + */ + for (pos = cur, n = pos->next; pos != end; + pos = n, n = pos->next) { + struct pending_extent_op *tmp; + tmp = list_entry(pos, struct pending_extent_op, list); + + /* + * remember tmp->del tells us whether or not we pinned + * down the extent + */ + ret = update_block_group(trans, extent_root, + tmp->bytenr, tmp->num_bytes, 0, + tmp->del); + BUG_ON(ret); + + list_del_init(&tmp->list); + unlock_extent(&info->extent_ins, tmp->bytenr, + tmp->bytenr + tmp->num_bytes - 1, + GFP_NOFS); + kfree(tmp); + } + } else if (refs && found_extent) { + /* + * the ref and extent were right next to each other, but the + * extent still has a ref, so just free the backref and keep + * going + */ + ret = remove_extent_backref(trans, extent_root, path); + BUG_ON(ret); + + list_del_init(&op->list); + unlock_extent(&info->extent_ins, op->bytenr, + op->bytenr + op->num_bytes - 1, GFP_NOFS); + kfree(op); + } else { + /* + * the extent has multiple refs and the backref we were looking + * for was not right next to it, so just unlock and go next, + * we're good to go + */ + list_del_init(&op->list); + unlock_extent(&info->extent_ins, op->bytenr, + op->bytenr + op->num_bytes - 1, GFP_NOFS); + kfree(op); + } + + btrfs_release_path(extent_root, path); + if (!list_empty(del_list)) + goto search; + +out: + btrfs_free_path(path); + return ret; +} + static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, - u64 num_bytes, u64 orig_parent, u64 parent, u64 orig_root, u64 ref_root, u64 orig_generation, u64 ref_generation, u64 owner_objectid) { int ret; - int pin = owner_objectid < BTRFS_FIRST_FREE_OBJECTID; + struct btrfs_root *extent_root = root->fs_info->extent_root; + struct btrfs_path *path; + + if (root == root->fs_info->extent_root) { + struct pending_extent_op *extent_op; + u64 num_bytes; + + BUG_ON(owner_objectid >= BTRFS_MAX_LEVEL); + num_bytes = btrfs_level_size(root, (int)owner_objectid); + mutex_lock(&root->fs_info->extent_ins_mutex); + if (test_range_bit(&root->fs_info->extent_ins, bytenr, + bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) { + u64 priv; + ret = get_state_private(&root->fs_info->extent_ins, + bytenr, &priv); + BUG_ON(ret); + extent_op = (struct pending_extent_op *) + (unsigned long)priv; + BUG_ON(extent_op->parent != orig_parent); + BUG_ON(extent_op->generation != orig_generation); - ret = btrfs_update_delayed_ref(trans, bytenr, num_bytes, - orig_parent, parent, orig_root, - ref_root, orig_generation, - ref_generation, owner_objectid, pin); + extent_op->parent = parent; + extent_op->generation = ref_generation; + } else { + extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); + BUG_ON(!extent_op); + + extent_op->type = PENDING_BACKREF_UPDATE; + extent_op->bytenr = bytenr; + extent_op->num_bytes = num_bytes; + extent_op->parent = parent; + extent_op->orig_parent =
orig_parent; + extent_op->generation = ref_generation; + extent_op->orig_generation = orig_generation; + extent_op->level = (int)owner_objectid; + INIT_LIST_HEAD(&extent_op->list); + extent_op->del = 0; + + set_extent_bits(&root->fs_info->extent_ins, + bytenr, bytenr + num_bytes - 1, + EXTENT_WRITEBACK, GFP_NOFS); + set_state_private(&root->fs_info->extent_ins, + bytenr, (unsigned long)extent_op); + } + mutex_unlock(&root->fs_info->extent_ins_mutex); + return 0; + } + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + ret = lookup_extent_backref(trans, extent_root, path, + bytenr, orig_parent, orig_root, + orig_generation, owner_objectid, 1); + if (ret) + goto out; + ret = remove_extent_backref(trans, extent_root, path); + if (ret) + goto out; + ret = insert_extent_backref(trans, extent_root, path, bytenr, + parent, ref_root, ref_generation, + owner_objectid); BUG_ON(ret); + finish_current_insert(trans, extent_root, 0); + del_pending_extents(trans, extent_root, 0); +out: + btrfs_free_path(path); return ret; } int btrfs_update_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, - u64 num_bytes, u64 orig_parent, u64 parent, + u64 orig_parent, u64 parent, u64 ref_root, u64 ref_generation, u64 owner_objectid) { @@ -716,35 +1260,19 @@ int btrfs_update_extent_ref(struct btrfs_trans_handle *trans, if (ref_root == BTRFS_TREE_LOG_OBJECTID && owner_objectid < BTRFS_FIRST_FREE_OBJECTID) return 0; - - ret = __btrfs_update_extent_ref(trans, root, bytenr, num_bytes, - orig_parent, parent, ref_root, - ref_root, ref_generation, - ref_generation, owner_objectid); + ret = __btrfs_update_extent_ref(trans, root, bytenr, orig_parent, + parent, ref_root, ref_root, + ref_generation, ref_generation, + owner_objectid); return ret; } + static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, - u64 num_bytes, u64 orig_parent, u64 parent, u64 orig_root, u64 ref_root, u64 orig_generation, u64 ref_generation, u64 owner_objectid) -{ - int ret; - - ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent, ref_root, - ref_generation, owner_objectid, - BTRFS_ADD_DELAYED_REF, 0); - BUG_ON(ret); - return ret; -} - -static noinline_for_stack int add_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr, - u64 num_bytes, u64 parent, u64 ref_root, - u64 ref_generation, u64 owner_objectid, - int refs_to_add) { struct btrfs_path *path; int ret; @@ -758,24 +1286,17 @@ static noinline_for_stack int add_extent_ref(struct btrfs_trans_handle *trans, return -ENOMEM; path->reada = 1; - path->leave_spinning = 1; key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = num_bytes; + key.offset = (u64)-1; - /* first find the extent item and update its reference count */ - ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, - path, 0, 1); - if (ret < 0) { - btrfs_set_path_blocking(path); + ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path, + 0, 1); + if (ret < 0) return ret; - } + BUG_ON(ret == 0 || path->slots[0] == 0); - if (ret > 0) { - WARN_ON(1); - btrfs_free_path(path); - return -EIO; - } + path->slots[0]--; l = path->nodes[0]; btrfs_item_key_to_cpu(l, &key, path->slots[0]); @@ -789,24 +1310,21 @@ static noinline_for_stack int add_extent_ref(struct btrfs_trans_handle *trans, BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY); item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item); - refs = btrfs_extent_refs(l, item); - btrfs_set_extent_refs(l, item, refs + 
refs_to_add); - btrfs_unlock_up_safe(path, 1); - + btrfs_set_extent_refs(l, item, refs + 1); btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_release_path(root->fs_info->extent_root, path); path->reada = 1; - path->leave_spinning = 1; - - /* now insert the actual backref */ ret = insert_extent_backref(trans, root->fs_info->extent_root, path, bytenr, parent, ref_root, ref_generation, - owner_objectid, refs_to_add); + owner_objectid); BUG_ON(ret); + finish_current_insert(trans, root->fs_info->extent_root, 0); + del_pending_extents(trans, root->fs_info->extent_root, 0); + btrfs_free_path(path); return 0; } @@ -821,278 +1339,68 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, if (ref_root == BTRFS_TREE_LOG_OBJECTID && owner_objectid < BTRFS_FIRST_FREE_OBJECTID) return 0; + ret = __btrfs_inc_extent_ref(trans, root, bytenr, 0, parent, + 0, ref_root, 0, ref_generation, + owner_objectid); + return ret; +} - ret = __btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, parent, - 0, ref_root, 0, ref_generation, - owner_objectid); - return ret; -} - -static int drop_delayed_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_delayed_ref_node *node) -{ - int ret = 0; - struct btrfs_delayed_ref *ref = btrfs_delayed_node_to_ref(node); - - BUG_ON(node->ref_mod == 0); - ret = __btrfs_free_extent(trans, root, node->bytenr, node->num_bytes, - node->parent, ref->root, ref->generation, - ref->owner_objectid, ref->pin, node->ref_mod); - - return ret; -} - -/* helper function to actually process a single delayed ref entry */ -static noinline int run_one_delayed_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_delayed_ref_node *node, - int insert_reserved) -{ - int ret; - struct btrfs_delayed_ref *ref; - - if (node->parent == (u64)-1) { - struct btrfs_delayed_ref_head *head; - /* - * we've hit the end of the chain and we were supposed - * to insert this extent into the tree. But, it got - * deleted before we ever needed to insert it, so all - * we have to do is clean up the accounting - */ - if (insert_reserved) { - update_reserved_extents(root, node->bytenr, - node->num_bytes, 0); - } - head = btrfs_delayed_node_to_head(node); - mutex_unlock(&head->mutex); - return 0; - } - - ref = btrfs_delayed_node_to_ref(node); - if (ref->action == BTRFS_ADD_DELAYED_REF) { - if (insert_reserved) { - struct btrfs_key ins; - - ins.objectid = node->bytenr; - ins.offset = node->num_bytes; - ins.type = BTRFS_EXTENT_ITEM_KEY; - - /* record the full extent allocation */ - ret = __btrfs_alloc_reserved_extent(trans, root, - node->parent, ref->root, - ref->generation, ref->owner_objectid, - &ins, node->ref_mod); - update_reserved_extents(root, node->bytenr, - node->num_bytes, 0); - } else { - /* just add one backref */ - ret = add_extent_ref(trans, root, node->bytenr, - node->num_bytes, - node->parent, ref->root, ref->generation, - ref->owner_objectid, node->ref_mod); - } - BUG_ON(ret); - } else if (ref->action == BTRFS_DROP_DELAYED_REF) { - WARN_ON(insert_reserved); - ret = drop_delayed_ref(trans, root, node); - } - return 0; -} - -static noinline struct btrfs_delayed_ref_node * -select_delayed_ref(struct btrfs_delayed_ref_head *head) -{ - struct rb_node *node; - struct btrfs_delayed_ref_node *ref; - int action = BTRFS_ADD_DELAYED_REF; -again: - /* - * select delayed ref of type BTRFS_ADD_DELAYED_REF first. - * this prevents ref count from going down to zero when - * there still are pending delayed ref. 
- */ - node = rb_prev(&head->node.rb_node); - while (1) { - if (!node) - break; - ref = rb_entry(node, struct btrfs_delayed_ref_node, - rb_node); - if (ref->bytenr != head->node.bytenr) - break; - if (btrfs_delayed_node_to_ref(ref)->action == action) - return ref; - node = rb_prev(node); - } - if (action == BTRFS_ADD_DELAYED_REF) { - action = BTRFS_DROP_DELAYED_REF; - goto again; - } - return NULL; -} - -static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct list_head *cluster) -{ - struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_delayed_ref_node *ref; - struct btrfs_delayed_ref_head *locked_ref = NULL; - int ret; - int count = 0; - int must_insert_reserved = 0; - - delayed_refs = &trans->transaction->delayed_refs; - while (1) { - if (!locked_ref) { - /* pick a new head ref from the cluster list */ - if (list_empty(cluster)) - break; - - locked_ref = list_entry(cluster->next, - struct btrfs_delayed_ref_head, cluster); - - /* grab the lock that says we are going to process - * all the refs for this head */ - ret = btrfs_delayed_ref_lock(trans, locked_ref); - - /* - * we may have dropped the spin lock to get the head - * mutex lock, and that might have given someone else - * time to free the head. If that's true, it has been - * removed from our list and we can move on. - */ - if (ret == -EAGAIN) { - locked_ref = NULL; - count++; - continue; - } - } - - /* - * record the must insert reserved flag before we - * drop the spin lock. - */ - must_insert_reserved = locked_ref->must_insert_reserved; - locked_ref->must_insert_reserved = 0; - - /* - * locked_ref is the head node, so we have to go one - * node back for any delayed ref updates - */ - ref = select_delayed_ref(locked_ref); - if (!ref) { - /* All delayed refs have been processed, Go ahead - * and send the head node to run_one_delayed_ref, - * so that any accounting fixes can happen - */ - ref = &locked_ref->node; - list_del_init(&locked_ref->cluster); - locked_ref = NULL; - } - - ref->in_tree = 0; - rb_erase(&ref->rb_node, &delayed_refs->root); - delayed_refs->num_entries--; - spin_unlock(&delayed_refs->lock); - - ret = run_one_delayed_ref(trans, root, ref, - must_insert_reserved); - BUG_ON(ret); - btrfs_put_delayed_ref(ref); - - count++; - cond_resched(); - spin_lock(&delayed_refs->lock); - } - return count; -} - -/* - * this starts processing the delayed reference count updates and - * extent insertions we have queued up so far. count can be - * 0, which means to process everything in the tree at the start - * of the run (but not newly added entries), or it can be some target - * number you'd like to process. - */ -int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, - struct btrfs_root *root, unsigned long count) -{ - struct rb_node *node; - struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_delayed_ref_node *ref; - struct list_head cluster; - int ret; - int run_all = count == (unsigned long)-1; - int run_most = 0; - - if (root == root->fs_info->extent_root) - root = root->fs_info->tree_root; - - delayed_refs = &trans->transaction->delayed_refs; - INIT_LIST_HEAD(&cluster); -again: - spin_lock(&delayed_refs->lock); - if (count == 0) { - count = delayed_refs->num_entries * 2; - run_most = 1; - } - while (1) { - if (!(run_all || run_most) && - delayed_refs->num_heads_ready < 64) - break; - - /* - * go find something we can process in the rbtree. 
We start at - * the beginning of the tree, and then build a cluster - * of refs to process starting at the first one we are able to - * lock - */ - ret = btrfs_find_ref_cluster(trans, &cluster, - delayed_refs->run_delayed_start); - if (ret) - break; - - ret = run_clustered_refs(trans, root, &cluster); - BUG_ON(ret < 0); +int btrfs_extent_post_op(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + u64 start; + u64 end; + int ret; - count -= min_t(unsigned long, ret, count); + while(1) { + finish_current_insert(trans, root->fs_info->extent_root, 1); + del_pending_extents(trans, root->fs_info->extent_root, 1); - if (count == 0) - break; + /* is there more work to do? */ + ret = find_first_extent_bit(&root->fs_info->pending_del, + 0, &start, &end, EXTENT_WRITEBACK); + if (!ret) + continue; + ret = find_first_extent_bit(&root->fs_info->extent_ins, + 0, &start, &end, EXTENT_WRITEBACK); + if (!ret) + continue; + break; } + return 0; +} - if (run_all) { - node = rb_first(&delayed_refs->root); - if (!node) - goto out; - count = (unsigned long)-1; - - while (node) { - ref = rb_entry(node, struct btrfs_delayed_ref_node, - rb_node); - if (btrfs_delayed_ref_is_head(ref)) { - struct btrfs_delayed_ref_head *head; - - head = btrfs_delayed_node_to_head(ref); - atomic_inc(&ref->refs); - - spin_unlock(&delayed_refs->lock); - mutex_lock(&head->mutex); - mutex_unlock(&head->mutex); +int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u32 *refs) +{ + struct btrfs_path *path; + int ret; + struct btrfs_key key; + struct extent_buffer *l; + struct btrfs_extent_item *item; - btrfs_put_delayed_ref(ref); - cond_resched(); - goto again; - } - node = rb_next(node); - } - spin_unlock(&delayed_refs->lock); - schedule_timeout(1); - goto again; + WARN_ON(num_bytes < root->sectorsize); + path = btrfs_alloc_path(); + path->reada = 1; + key.objectid = bytenr; + key.offset = num_bytes; + btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); + ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path, + 0, 0); + if (ret < 0) + goto out; + if (ret != 0) { + btrfs_print_leaf(root, path->nodes[0]); + printk(KERN_INFO "btrfs failed to find block number %llu\n", + (unsigned long long)bytenr); + BUG(); } + l = path->nodes[0]; + item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item); + *refs = btrfs_extent_refs(l, item); out: - spin_unlock(&delayed_refs->lock); + btrfs_free_path(path); return 0; } @@ -1316,7 +1624,7 @@ noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans, int refi = 0; int slot; int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, - u64, u64, u64, u64, u64, u64, u64, u64, u64); + u64, u64, u64, u64, u64, u64, u64, u64); ref_root = btrfs_header_owner(buf); ref_generation = btrfs_header_generation(buf); @@ -1388,19 +1696,12 @@ noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans, if (level == 0) { btrfs_item_key_to_cpu(buf, &key, slot); - fi = btrfs_item_ptr(buf, slot, - struct btrfs_file_extent_item); - - bytenr = btrfs_file_extent_disk_bytenr(buf, fi); - if (bytenr == 0) - continue; ret = process_func(trans, root, bytenr, - btrfs_file_extent_disk_num_bytes(buf, fi), - orig_buf->start, buf->start, - orig_root, ref_root, - orig_generation, ref_generation, - key.objectid); + orig_buf->start, buf->start, + orig_root, ref_root, + orig_generation, ref_generation, + key.objectid); if (ret) { faili = slot; @@ -1408,7 +1709,7 @@ noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans, goto fail; 
} } else { - ret = process_func(trans, root, bytenr, buf->len, + ret = process_func(trans, root, bytenr, orig_buf->start, buf->start, orig_root, ref_root, orig_generation, ref_generation, @@ -1485,17 +1786,17 @@ int btrfs_update_ref(struct btrfs_trans_handle *trans, if (bytenr == 0) continue; ret = __btrfs_update_extent_ref(trans, root, bytenr, - btrfs_file_extent_disk_num_bytes(buf, fi), - orig_buf->start, buf->start, - orig_root, ref_root, orig_generation, - ref_generation, key.objectid); + orig_buf->start, buf->start, + orig_root, ref_root, + orig_generation, ref_generation, + key.objectid); if (ret) goto fail; } else { bytenr = btrfs_node_blockptr(buf, slot); ret = __btrfs_update_extent_ref(trans, root, bytenr, - buf->len, orig_buf->start, - buf->start, orig_root, ref_root, + orig_buf->start, buf->start, + orig_root, ref_root, orig_generation, ref_generation, level - 1); if (ret) @@ -1514,6 +1815,7 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *cache) { int ret; + int pending_ret; struct btrfs_root *extent_root = root->fs_info->extent_root; unsigned long bi; struct extent_buffer *leaf; @@ -1529,8 +1831,12 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_release_path(extent_root, path); fail: + finish_current_insert(trans, extent_root, 0); + pending_ret = del_pending_extents(trans, extent_root, 0); if (ret) return ret; + if (pending_ret) + return pending_ret; return 0; } @@ -2055,8 +2361,6 @@ int btrfs_update_pinned_extents(struct btrfs_root *root, clear_extent_dirty(&fs_info->pinned_extents, bytenr, bytenr + num - 1, GFP_NOFS); } - mutex_unlock(&root->fs_info->pinned_mutex); - while (num > 0) { cache = btrfs_lookup_block_group(fs_info, bytenr); BUG_ON(!cache); @@ -2148,8 +2452,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, u64 end; int ret; + mutex_lock(&root->fs_info->pinned_mutex); while (1) { - mutex_lock(&root->fs_info->pinned_mutex); ret = find_first_extent_bit(unpin, 0, &start, &end, EXTENT_DIRTY); if (ret) @@ -2157,21 +2461,209 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, ret = btrfs_discard_extent(root, start, end + 1 - start); - /* unlocks the pinned mutex */ btrfs_update_pinned_extents(root, start, end + 1 - start, 0); clear_extent_dirty(unpin, start, end, GFP_NOFS); - cond_resched(); + if (need_resched()) { + mutex_unlock(&root->fs_info->pinned_mutex); + cond_resched(); + mutex_lock(&root->fs_info->pinned_mutex); + } } mutex_unlock(&root->fs_info->pinned_mutex); return ret; } +static int finish_current_insert(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, int all) +{ + u64 start; + u64 end; + u64 priv; + u64 search = 0; + struct btrfs_fs_info *info = extent_root->fs_info; + struct btrfs_path *path; + struct pending_extent_op *extent_op, *tmp; + struct list_head insert_list, update_list; + int ret; + int num_inserts = 0, max_inserts, restart = 0; + + path = btrfs_alloc_path(); + INIT_LIST_HEAD(&insert_list); + INIT_LIST_HEAD(&update_list); + + max_inserts = extent_root->leafsize / + (2 * sizeof(struct btrfs_key) + 2 * sizeof(struct btrfs_item) + + sizeof(struct btrfs_extent_ref) + + sizeof(struct btrfs_extent_item)); +again: + mutex_lock(&info->extent_ins_mutex); + while (1) { + ret = find_first_extent_bit(&info->extent_ins, search, &start, + &end, EXTENT_WRITEBACK); + if (ret) { + if (restart && !num_inserts && + list_empty(&update_list)) { + restart = 0; + search = 0; + continue; + } + break; + } 
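+ + /* + * try to lock the range covered by this pending op. If somebody + * else already holds the lock we just skip it for now; on a full + * pass (all) we note the skip via restart so a later pass will + * come back to it. + */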
+ + ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS); + if (!ret) { + if (all) + restart = 1; + search = end + 1; + if (need_resched()) { + mutex_unlock(&info->extent_ins_mutex); + cond_resched(); + mutex_lock(&info->extent_ins_mutex); + } + continue; + } + + ret = get_state_private(&info->extent_ins, start, &priv); + BUG_ON(ret); + extent_op = (struct pending_extent_op *)(unsigned long) priv; + + if (extent_op->type == PENDING_EXTENT_INSERT) { + num_inserts++; + list_add_tail(&extent_op->list, &insert_list); + search = end + 1; + if (num_inserts == max_inserts) { + restart = 1; + break; + } + } else if (extent_op->type == PENDING_BACKREF_UPDATE) { + list_add_tail(&extent_op->list, &update_list); + search = end + 1; + } else { + BUG(); + } + } + + /* + * process the update list, clear the writeback bit for it, and if + * somebody marked this thing for deletion then just unlock it and be + * done, the free_extents will handle it + */ + list_for_each_entry_safe(extent_op, tmp, &update_list, list) { + clear_extent_bits(&info->extent_ins, extent_op->bytenr, + extent_op->bytenr + extent_op->num_bytes - 1, + EXTENT_WRITEBACK, GFP_NOFS); + if (extent_op->del) { + list_del_init(&extent_op->list); + unlock_extent(&info->extent_ins, extent_op->bytenr, + extent_op->bytenr + extent_op->num_bytes + - 1, GFP_NOFS); + kfree(extent_op); + } + } + mutex_unlock(&info->extent_ins_mutex); + + /* + * still have things left on the update list, go ahead and update + * everything + */ + if (!list_empty(&update_list)) { + ret = update_backrefs(trans, extent_root, path, &update_list); + BUG_ON(ret); + + /* we may have COW'ed new blocks, so let's start over */ + if (all) + restart = 1; + } + + /* + * if no inserts need to be done, but we skipped some extents and we + * need to make sure everything is cleaned then reset everything and + * go back to the beginning + */ + if (!num_inserts && restart) { + search = 0; + restart = 0; + INIT_LIST_HEAD(&update_list); + INIT_LIST_HEAD(&insert_list); + goto again; + } else if (!num_inserts) { + goto out; + } + + /* + * process the insert extents list. Again if we are deleting this + * extent, then just unlock it, pin down the bytes if need be, and be + * done with it.
Saves us from having to actually insert the extent + * into the tree and then subsequently come along and delete it + */ + mutex_lock(&info->extent_ins_mutex); + list_for_each_entry_safe(extent_op, tmp, &insert_list, list) { + clear_extent_bits(&info->extent_ins, extent_op->bytenr, + extent_op->bytenr + extent_op->num_bytes - 1, + EXTENT_WRITEBACK, GFP_NOFS); + if (extent_op->del) { + u64 used; + list_del_init(&extent_op->list); + unlock_extent(&info->extent_ins, extent_op->bytenr, + extent_op->bytenr + extent_op->num_bytes + - 1, GFP_NOFS); + + mutex_lock(&extent_root->fs_info->pinned_mutex); + ret = pin_down_bytes(trans, extent_root, + extent_op->bytenr, + extent_op->num_bytes, 0); + mutex_unlock(&extent_root->fs_info->pinned_mutex); + + spin_lock(&info->delalloc_lock); + used = btrfs_super_bytes_used(&info->super_copy); + btrfs_set_super_bytes_used(&info->super_copy, + used - extent_op->num_bytes); + used = btrfs_root_used(&extent_root->root_item); + btrfs_set_root_used(&extent_root->root_item, + used - extent_op->num_bytes); + spin_unlock(&info->delalloc_lock); + + ret = update_block_group(trans, extent_root, + extent_op->bytenr, + extent_op->num_bytes, + 0, ret > 0); + BUG_ON(ret); + kfree(extent_op); + num_inserts--; + } + } + mutex_unlock(&info->extent_ins_mutex); + + ret = insert_extents(trans, extent_root, path, &insert_list, + num_inserts); + BUG_ON(ret); + + /* + * if restart is set for whatever reason we need to go back and start + * searching through the pending list again. + * + * We just inserted some extents, which could have resulted in new + * blocks being allocated, which would result in new blocks needing + * updates, so if all is set we _must_ restart to get the updated + * blocks. + */ + if (restart || all) { + INIT_LIST_HEAD(&insert_list); + INIT_LIST_HEAD(&update_list); + search = 0; + restart = 0; + num_inserts = 0; + goto again; + } +out: + btrfs_free_path(path); + return 0; +} + static int pin_down_bytes(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_path *path, - u64 bytenr, u64 num_bytes, int is_data, - struct extent_buffer **must_clean) + u64 bytenr, u64 num_bytes, int is_data) { int err = 0; struct extent_buffer *buf; @@ -2194,19 +2686,17 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans, u64 header_transid = btrfs_header_generation(buf); if (header_owner != BTRFS_TREE_LOG_OBJECTID && header_owner != BTRFS_TREE_RELOC_OBJECTID && - header_owner != BTRFS_DATA_RELOC_TREE_OBJECTID && header_transid == trans->transid && !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { - *must_clean = buf; + clean_tree_block(NULL, root, buf); + btrfs_tree_unlock(buf); + free_extent_buffer(buf); return 1; } btrfs_tree_unlock(buf); } free_extent_buffer(buf); pinit: - btrfs_set_path_blocking(path); - mutex_lock(&root->fs_info->pinned_mutex); - /* unlocks the pinned mutex */ btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); BUG_ON(err < 0); @@ -2220,8 +2710,7 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 ref_generation, - u64 owner_objectid, int pin, int mark_free, - int refs_to_drop) + u64 owner_objectid, int pin, int mark_free) { struct btrfs_path *path; struct btrfs_key key; @@ -2243,7 +2732,6 @@ static int __free_extent(struct btrfs_trans_handle *trans, return -ENOMEM; path->reada = 1; - path->leave_spinning = 1; ret = lookup_extent_backref(trans, extent_root, path, bytenr, parent, root_objectid, ref_generation, owner_objectid, 
1); @@ -2265,11 +2753,9 @@ static int __free_extent(struct btrfs_trans_handle *trans, break; } if (!found_extent) { - ret = remove_extent_backref(trans, extent_root, path, - refs_to_drop); + ret = remove_extent_backref(trans, extent_root, path); BUG_ON(ret); btrfs_release_path(extent_root, path); - path->leave_spinning = 1; ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1); if (ret) { @@ -2285,9 +2771,8 @@ static int __free_extent(struct btrfs_trans_handle *trans, btrfs_print_leaf(extent_root, path->nodes[0]); WARN_ON(1); printk(KERN_ERR "btrfs unable to find ref byte nr %llu " - "parent %llu root %llu gen %llu owner %llu\n", + "root %llu gen %llu owner %llu\n", (unsigned long long)bytenr, - (unsigned long long)parent, (unsigned long long)root_objectid, (unsigned long long)ref_generation, (unsigned long long)owner_objectid); @@ -2297,23 +2782,17 @@ static int __free_extent(struct btrfs_trans_handle *trans, ei = btrfs_item_ptr(leaf, extent_slot, struct btrfs_extent_item); refs = btrfs_extent_refs(leaf, ei); - - /* - * we're not allowed to delete the extent item if there - * are other delayed ref updates pending - */ - - BUG_ON(refs < refs_to_drop); - refs -= refs_to_drop; + BUG_ON(refs == 0); + refs -= 1; btrfs_set_extent_refs(leaf, ei, refs); + btrfs_mark_buffer_dirty(leaf); - if (refs == 0 && found_extent && - path->slots[0] == extent_slot + 1) { + if (refs == 0 && found_extent && path->slots[0] == extent_slot + 1) { struct btrfs_extent_ref *ref; ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); - BUG_ON(btrfs_ref_num_refs(leaf, ref) != refs_to_drop); + BUG_ON(btrfs_ref_num_refs(leaf, ref) != 1); /* if the back ref and the extent are next to each other * they get deleted below in one shot */ @@ -2321,13 +2800,11 @@ static int __free_extent(struct btrfs_trans_handle *trans, num_to_del = 2; } else if (found_extent) { /* otherwise delete the extent back ref */ - ret = remove_extent_backref(trans, extent_root, path, - refs_to_drop); + ret = remove_extent_backref(trans, extent_root, path); BUG_ON(ret); /* if refs are 0, we need to setup the path for deletion */ if (refs == 0) { btrfs_release_path(extent_root, path); - path->leave_spinning = 1; ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1); BUG_ON(ret); @@ -2337,18 +2814,16 @@ static int __free_extent(struct btrfs_trans_handle *trans, if (refs == 0) { u64 super_used; u64 root_used; - struct extent_buffer *must_clean = NULL; if (pin) { - ret = pin_down_bytes(trans, root, path, - bytenr, num_bytes, - owner_objectid >= BTRFS_FIRST_FREE_OBJECTID, - &must_clean); + mutex_lock(&root->fs_info->pinned_mutex); + ret = pin_down_bytes(trans, root, bytenr, num_bytes, + owner_objectid >= BTRFS_FIRST_FREE_OBJECTID); + mutex_unlock(&root->fs_info->pinned_mutex); if (ret > 0) mark_free = 1; BUG_ON(ret < 0); } - /* block accounting for super block */ spin_lock(&info->delalloc_lock); super_used = btrfs_super_bytes_used(&info->super_copy); @@ -2360,34 +2835,14 @@ static int __free_extent(struct btrfs_trans_handle *trans, btrfs_set_root_used(&root->root_item, root_used - num_bytes); spin_unlock(&info->delalloc_lock); - - /* - * it is going to be very rare for someone to be waiting - * on the block we're freeing. 
del_items might need to - * schedule, so rather than get fancy, just force it - * to blocking here - */ - if (must_clean) - btrfs_set_lock_blocking(must_clean); - ret = btrfs_del_items(trans, extent_root, path, path->slots[0], num_to_del); BUG_ON(ret); btrfs_release_path(extent_root, path); - if (must_clean) { - clean_tree_block(NULL, root, must_clean); - btrfs_tree_unlock(must_clean); - free_extent_buffer(must_clean); - } - if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_del_csums(trans, root, bytenr, num_bytes); BUG_ON(ret); - } else { - invalidate_mapping_pages(info->btree_inode->i_mapping, - bytenr >> PAGE_CACHE_SHIFT, - (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT); } ret = update_block_group(trans, root, bytenr, num_bytes, 0, @@ -2395,103 +2850,218 @@ static int __free_extent(struct btrfs_trans_handle *trans, BUG_ON(ret); } btrfs_free_path(path); + finish_current_insert(trans, extent_root, 0); return ret; } /* - * remove an extent from the root, returns 0 on success + * find all the blocks marked as pending in the radix tree and remove + * them from the extent map */ -static int __btrfs_free_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 ref_generation, - u64 owner_objectid, int pin, - int refs_to_drop) +static int del_pending_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, int all) { - WARN_ON(num_bytes < root->sectorsize); + int ret; + int err = 0; + u64 start; + u64 end; + u64 priv; + u64 search = 0; + int nr = 0, skipped = 0; + struct extent_io_tree *pending_del; + struct extent_io_tree *extent_ins; + struct pending_extent_op *extent_op; + struct btrfs_fs_info *info = extent_root->fs_info; + struct list_head delete_list; + + INIT_LIST_HEAD(&delete_list); + extent_ins = &extent_root->fs_info->extent_ins; + pending_del = &extent_root->fs_info->pending_del; - /* - * if metadata always pin - * if data pin when any transaction has committed this - */ - if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID || - ref_generation != trans->transid) - pin = 1; +again: + mutex_lock(&info->extent_ins_mutex); + while (1) { + ret = find_first_extent_bit(pending_del, search, &start, &end, + EXTENT_WRITEBACK); + if (ret) { + if (all && skipped && !nr) { + search = 0; + skipped = 0; + continue; + } + mutex_unlock(&info->extent_ins_mutex); + break; + } - if (ref_generation != trans->transid) - pin = 1; + ret = try_lock_extent(extent_ins, start, end, GFP_NOFS); + if (!ret) { + search = end+1; + skipped = 1; - return __free_extent(trans, root, bytenr, num_bytes, parent, - root_objectid, ref_generation, - owner_objectid, pin, pin == 0, refs_to_drop); + if (need_resched()) { + mutex_unlock(&info->extent_ins_mutex); + cond_resched(); + mutex_lock(&info->extent_ins_mutex); + } + + continue; + } + BUG_ON(ret < 0); + + ret = get_state_private(pending_del, start, &priv); + BUG_ON(ret); + extent_op = (struct pending_extent_op *)(unsigned long)priv; + + clear_extent_bits(pending_del, start, end, EXTENT_WRITEBACK, + GFP_NOFS); + if (!test_range_bit(extent_ins, start, end, + EXTENT_WRITEBACK, 0)) { + list_add_tail(&extent_op->list, &delete_list); + nr++; + } else { + kfree(extent_op); + + ret = get_state_private(&info->extent_ins, start, + &priv); + BUG_ON(ret); + extent_op = (struct pending_extent_op *) + (unsigned long)priv; + + clear_extent_bits(&info->extent_ins, start, end, + EXTENT_WRITEBACK, GFP_NOFS); + + if (extent_op->type == PENDING_BACKREF_UPDATE) { + 
list_add_tail(&extent_op->list, &delete_list); + search = end + 1; + nr++; + continue; + } + + mutex_lock(&extent_root->fs_info->pinned_mutex); + ret = pin_down_bytes(trans, extent_root, start, + end + 1 - start, 0); + mutex_unlock(&extent_root->fs_info->pinned_mutex); + + ret = update_block_group(trans, extent_root, start, + end + 1 - start, 0, ret > 0); + + unlock_extent(extent_ins, start, end, GFP_NOFS); + BUG_ON(ret); + kfree(extent_op); + } + if (ret) + err = ret; + + search = end + 1; + + if (need_resched()) { + mutex_unlock(&info->extent_ins_mutex); + cond_resched(); + mutex_lock(&info->extent_ins_mutex); + } + } + + if (nr) { + ret = free_extents(trans, extent_root, &delete_list); + BUG_ON(ret); + } + + if (all && skipped) { + INIT_LIST_HEAD(&delete_list); + search = 0; + nr = 0; + goto again; + } + + if (!err) + finish_current_insert(trans, extent_root, 0); + return err; } /* - * when we free an extent, it is possible (and likely) that we free the last - * delayed ref for that extent as well. This searches the delayed ref tree for - * a given extent, and if there are no other delayed refs to be processed, it - * removes it from the tree. + * remove an extent from the root, returns 0 on success */ -static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr) +static int __btrfs_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 ref_generation, + u64 owner_objectid, int pin) { - struct btrfs_delayed_ref_head *head; - struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_delayed_ref_node *ref; - struct rb_node *node; + struct btrfs_root *extent_root = root->fs_info->extent_root; + int pending_ret; int ret; - delayed_refs = &trans->transaction->delayed_refs; - spin_lock(&delayed_refs->lock); - head = btrfs_find_delayed_ref_head(trans, bytenr); - if (!head) - goto out; - - node = rb_prev(&head->node.rb_node); - if (!node) - goto out; - - ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - - /* there are still entries for this ref, we can't drop it */ - if (ref->bytenr == bytenr) - goto out; + WARN_ON(num_bytes < root->sectorsize); + if (root == extent_root) { + struct pending_extent_op *extent_op = NULL; + + mutex_lock(&root->fs_info->extent_ins_mutex); + if (test_range_bit(&root->fs_info->extent_ins, bytenr, + bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) { + u64 priv; + ret = get_state_private(&root->fs_info->extent_ins, + bytenr, &priv); + BUG_ON(ret); + extent_op = (struct pending_extent_op *) + (unsigned long)priv; - /* - * waiting for the lock here would deadlock. If someone else has it - * locked they are already in the process of dropping it anyway - */ - if (!mutex_trylock(&head->mutex)) - goto out; + extent_op->del = 1; + if (extent_op->type == PENDING_EXTENT_INSERT) { + mutex_unlock(&root->fs_info->extent_ins_mutex); + return 0; + } + } - /* - * at this point we have a head with no other entries. Go - * ahead and process it. 
- */ - head->node.in_tree = 0; - rb_erase(&head->node.rb_node, &delayed_refs->root); + if (extent_op) { + ref_generation = extent_op->orig_generation; + parent = extent_op->orig_parent; + } - delayed_refs->num_entries--; + extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); + BUG_ON(!extent_op); + + extent_op->type = PENDING_EXTENT_DELETE; + extent_op->bytenr = bytenr; + extent_op->num_bytes = num_bytes; + extent_op->parent = parent; + extent_op->orig_parent = parent; + extent_op->generation = ref_generation; + extent_op->orig_generation = ref_generation; + extent_op->level = (int)owner_objectid; + INIT_LIST_HEAD(&extent_op->list); + extent_op->del = 0; + + set_extent_bits(&root->fs_info->pending_del, + bytenr, bytenr + num_bytes - 1, + EXTENT_WRITEBACK, GFP_NOFS); + set_state_private(&root->fs_info->pending_del, + bytenr, (unsigned long)extent_op); + mutex_unlock(&root->fs_info->extent_ins_mutex); + return 0; + } + /* if metadata always pin */ + if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) { + if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { + mutex_lock(&root->fs_info->pinned_mutex); + btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); + mutex_unlock(&root->fs_info->pinned_mutex); + update_reserved_extents(root, bytenr, num_bytes, 0); + return 0; + } + pin = 1; + } - /* - * we don't take a ref on the node because we're removing it from the - * tree, so we just steal the ref the tree was holding. - */ - delayed_refs->num_heads--; - if (list_empty(&head->cluster)) - delayed_refs->num_heads_ready--; + /* if data pin when any transaction has committed this */ + if (ref_generation != trans->transid) + pin = 1; - list_del_init(&head->cluster); - spin_unlock(&delayed_refs->lock); + ret = __free_extent(trans, root, bytenr, num_bytes, parent, + root_objectid, ref_generation, + owner_objectid, pin, pin == 0); - ret = run_one_delayed_ref(trans, root->fs_info->tree_root, - &head->node, head->must_insert_reserved); - BUG_ON(ret); - btrfs_put_delayed_ref(&head->node); - return 0; -out: - spin_unlock(&delayed_refs->lock); - return 0; + finish_current_insert(trans, root->fs_info->extent_root, 0); + pending_ret = del_pending_extents(trans, root->fs_info->extent_root, 0); + return ret ? ret : pending_ret; } int btrfs_free_extent(struct btrfs_trans_handle *trans, @@ -2502,30 +3072,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, { int ret; - /* - * tree log blocks never actually go into the extent allocation - * tree, just update pinning info and exit early. - * - * data extents referenced by the tree log do need to have - * their reference counts bumped. 
- */ - if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID && - owner_objectid < BTRFS_FIRST_FREE_OBJECTID) { - mutex_lock(&root->fs_info->pinned_mutex); - - /* unlocks the pinned mutex */ - btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); - update_reserved_extents(root, bytenr, num_bytes, 0); - ret = 0; - } else { - ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent, - root_objectid, ref_generation, - owner_objectid, - BTRFS_DROP_DELAYED_REF, 1); - BUG_ON(ret); - ret = check_ref_cleanup(trans, root, bytenr); - BUG_ON(ret); - } + ret = __btrfs_free_extent(trans, root, bytenr, num_bytes, parent, + root_objectid, ref_generation, + owner_objectid, pin); return ret; } @@ -2926,10 +3475,10 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans, static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 parent, u64 root_objectid, u64 ref_generation, - u64 owner, struct btrfs_key *ins, - int ref_mod) + u64 owner, struct btrfs_key *ins) { int ret; + int pending_ret; u64 super_used; u64 root_used; u64 num_bytes = ins->offset; @@ -2954,6 +3503,33 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, btrfs_set_root_used(&root->root_item, root_used + num_bytes); spin_unlock(&info->delalloc_lock); + if (root == extent_root) { + struct pending_extent_op *extent_op; + + extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); + BUG_ON(!extent_op); + + extent_op->type = PENDING_EXTENT_INSERT; + extent_op->bytenr = ins->objectid; + extent_op->num_bytes = ins->offset; + extent_op->parent = parent; + extent_op->orig_parent = 0; + extent_op->generation = ref_generation; + extent_op->orig_generation = 0; + extent_op->level = (int)owner; + INIT_LIST_HEAD(&extent_op->list); + extent_op->del = 0; + + mutex_lock(&root->fs_info->extent_ins_mutex); + set_extent_bits(&root->fs_info->extent_ins, ins->objectid, + ins->objectid + ins->offset - 1, + EXTENT_WRITEBACK, GFP_NOFS); + set_state_private(&root->fs_info->extent_ins, + ins->objectid, (unsigned long)extent_op); + mutex_unlock(&root->fs_info->extent_ins_mutex); + goto update_block; + } + memcpy(&keys[0], ins, sizeof(*ins)); keys[1].objectid = ins->objectid; keys[1].type = BTRFS_EXTENT_REF_KEY; @@ -2964,31 +3540,37 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); BUG_ON(!path); - path->leave_spinning = 1; ret = btrfs_insert_empty_items(trans, extent_root, path, keys, sizes, 2); BUG_ON(ret); extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_extent_item); - btrfs_set_extent_refs(path->nodes[0], extent_item, ref_mod); + btrfs_set_extent_refs(path->nodes[0], extent_item, 1); ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, struct btrfs_extent_ref); btrfs_set_ref_root(path->nodes[0], ref, root_objectid); btrfs_set_ref_generation(path->nodes[0], ref, ref_generation); btrfs_set_ref_objectid(path->nodes[0], ref, owner); - btrfs_set_ref_num_refs(path->nodes[0], ref, ref_mod); + btrfs_set_ref_num_refs(path->nodes[0], ref, 1); btrfs_mark_buffer_dirty(path->nodes[0]); trans->alloc_exclude_start = 0; trans->alloc_exclude_nr = 0; btrfs_free_path(path); + finish_current_insert(trans, extent_root, 0); + pending_ret = del_pending_extents(trans, extent_root, 0); if (ret) goto out; + if (pending_ret) { + ret = pending_ret; + goto out; + } +update_block: ret = update_block_group(trans, root, ins->objectid, ins->offset, 1, 0); if (ret) { @@ -3010,12 +3592,9 @@ int btrfs_alloc_reserved_extent(struct 
btrfs_trans_handle *trans, if (root_objectid == BTRFS_TREE_LOG_OBJECTID) return 0; - - ret = btrfs_add_delayed_ref(trans, ins->objectid, - ins->offset, parent, root_objectid, - ref_generation, owner, - BTRFS_ADD_DELAYED_EXTENT, 0); - BUG_ON(ret); + ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid, + ref_generation, owner, ins); + update_reserved_extents(root, ins->objectid, ins->offset, 0); return ret; } @@ -3042,7 +3621,7 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans, BUG_ON(ret); put_block_group(block_group); ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid, - ref_generation, owner, ins, 1); + ref_generation, owner, ins); return ret; } @@ -3061,18 +3640,20 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans, u64 search_end, struct btrfs_key *ins, u64 data) { int ret; + ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size, empty_size, hint_byte, search_end, ins, data); BUG_ON(ret); if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { - ret = btrfs_add_delayed_ref(trans, ins->objectid, - ins->offset, parent, root_objectid, - ref_generation, owner_objectid, - BTRFS_ADD_DELAYED_EXTENT, 0); + ret = __btrfs_alloc_reserved_extent(trans, root, parent, + root_objectid, ref_generation, + owner_objectid, ins); BUG_ON(ret); + + } else { + update_reserved_extents(root, ins->objectid, ins->offset, 1); } - update_reserved_extents(root, ins->objectid, ins->offset, 1); return ret; } @@ -3208,7 +3789,7 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); - ret = btrfs_free_extent(trans, root, disk_bytenr, + ret = __btrfs_free_extent(trans, root, disk_bytenr, btrfs_file_extent_disk_num_bytes(leaf, fi), leaf->start, leaf_owner, leaf_generation, key.objectid, 0); @@ -3248,7 +3829,7 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans, */ for (i = 0; i < ref->nritems; i++) { info = ref->extents + sorted[i].slot; - ret = btrfs_free_extent(trans, root, info->bytenr, + ret = __btrfs_free_extent(trans, root, info->bytenr, info->num_bytes, ref->bytenr, ref->owner, ref->generation, info->objectid, 0); @@ -3265,13 +3846,12 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans, return 0; } -static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 start, +static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start, u64 len, u32 *refs) { int ret; - ret = btrfs_lookup_extent_ref(trans, root, start, len, refs); + ret = btrfs_lookup_extent_ref(NULL, root, start, len, refs); BUG_ON(ret); #if 0 /* some debugging code in case we see problems here */ @@ -3379,8 +3959,7 @@ static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans, * we just decrement it below and don't update any * of the refs the leaf points to. 
*/ - ret = drop_snap_lookup_refcount(trans, root, bytenr, - blocksize, &refs); + ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs); BUG_ON(ret); if (refs != 1) continue; @@ -3431,7 +4010,7 @@ static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans, */ for (i = 0; i < refi; i++) { bytenr = sorted[i].bytenr; - ret = btrfs_free_extent(trans, root, bytenr, + ret = __btrfs_free_extent(trans, root, bytenr, blocksize, eb->start, root_owner, root_gen, 0, 1); BUG_ON(ret); @@ -3474,7 +4053,7 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, WARN_ON(*level < 0); WARN_ON(*level >= BTRFS_MAX_LEVEL); - ret = drop_snap_lookup_refcount(trans, root, path->nodes[*level]->start, + ret = drop_snap_lookup_refcount(root, path->nodes[*level]->start, path->nodes[*level]->len, &refs); BUG_ON(ret); if (refs > 1) @@ -3525,8 +4104,7 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); blocksize = btrfs_level_size(root, *level - 1); - ret = drop_snap_lookup_refcount(trans, root, bytenr, - blocksize, &refs); + ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs); BUG_ON(ret); /* @@ -3541,7 +4119,7 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, root_gen = btrfs_header_generation(parent); path->slots[*level]++; - ret = btrfs_free_extent(trans, root, bytenr, + ret = __btrfs_free_extent(trans, root, bytenr, blocksize, parent->start, root_owner, root_gen, *level - 1, 1); @@ -3587,7 +4165,7 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, * cleanup and free the reference on the last node * we processed */ - ret = btrfs_free_extent(trans, root, bytenr, blocksize, + ret = __btrfs_free_extent(trans, root, bytenr, blocksize, parent->start, root_owner, root_gen, *level, 1); free_extent_buffer(path->nodes[*level]); @@ -3776,7 +4354,6 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root struct btrfs_path *path; int i; int orig_level; - int update_count; struct btrfs_root_item *root_item = &root->root_item; WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex)); @@ -3818,7 +4395,6 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root } } while (1) { - unsigned long update; wret = walk_down_tree(trans, root, path, &level); if (wret > 0) break; @@ -3831,21 +4407,12 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root break; if (wret < 0) ret = wret; - if (trans->transaction->in_commit || - trans->transaction->delayed_refs.flushing) { + if (trans->transaction->in_commit) { ret = -EAGAIN; break; } atomic_inc(&root->fs_info->throttle_gen); wake_up(&root->fs_info->transaction_throttle); - for (update_count = 0; update_count < 16; update_count++) { - update = trans->delayed_ref_updates; - trans->delayed_ref_updates = 0; - if (update) - btrfs_run_delayed_refs(trans, root, update); - else - break; - } } for (i = 0; i <= orig_level; i++) { if (path->nodes[i]) { @@ -4890,7 +5457,6 @@ static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans, root->root_key.objectid, trans->transid, key.objectid); BUG_ON(ret); - ret = btrfs_free_extent(trans, root, bytenr, num_bytes, leaf->start, btrfs_header_owner(leaf), @@ -5202,6 +5768,9 @@ static noinline int relocate_tree_block(struct btrfs_trans_handle *trans, ref_path, NULL, NULL); BUG_ON(ret); + if (root == root->fs_info->extent_root) + btrfs_extent_post_op(trans, root); + return 0; } @@ -5469,7 +6038,6 @@ static int 
__insert_orphan_inode(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - path->leave_spinning = 1; ret = btrfs_insert_empty_inode(trans, root, path, objectid); if (ret) goto out; @@ -5640,9 +6208,6 @@ int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start) btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1); mutex_unlock(&root->fs_info->cleaner_mutex); - trans = btrfs_start_transaction(info->tree_root, 1); - btrfs_commit_transaction(trans, info->tree_root); - while (1) { ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) @@ -5901,7 +6466,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, extent_root = root->fs_info->extent_root; - root->fs_info->last_trans_log_full_commit = trans->transid; + root->fs_info->last_trans_new_blockgroup = trans->transid; cache = kzalloc(sizeof(*cache), GFP_NOFS); if (!cache) @@ -5935,6 +6500,9 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, sizeof(cache->item)); BUG_ON(ret); + finish_current_insert(trans, extent_root, 0); + ret = del_pending_extents(trans, extent_root, 0); + BUG_ON(ret); set_avail_alloc_bits(extent_root->fs_info, type); return 0; diff --git a/trunk/fs/btrfs/extent_io.c b/trunk/fs/btrfs/extent_io.c index 08085af089e2..ebe6b29e6069 100644 --- a/trunk/fs/btrfs/extent_io.c +++ b/trunk/fs/btrfs/extent_io.c @@ -3124,15 +3124,20 @@ void free_extent_buffer(struct extent_buffer *eb) int clear_extent_buffer_dirty(struct extent_io_tree *tree, struct extent_buffer *eb) { + int set; unsigned long i; unsigned long num_pages; struct page *page; + u64 start = eb->start; + u64 end = start + eb->len - 1; + + set = clear_extent_dirty(tree, start, end, GFP_NOFS); num_pages = num_extent_pages(eb->start, eb->len); for (i = 0; i < num_pages; i++) { page = extent_buffer_page(eb, i); - if (!PageDirty(page)) + if (!set && !PageDirty(page)) continue; lock_page(page); @@ -3141,6 +3146,22 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree, else set_page_private(page, EXTENT_PAGE_PRIVATE); + /* + * if we're on the last page or the first page and the + * block isn't aligned on a page boundary, do extra checks + * to make sure we don't clean page that is partially dirty + */ + if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || + ((i == num_pages - 1) && + ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) { + start = (u64)page->index << PAGE_CACHE_SHIFT; + end = start + PAGE_CACHE_SIZE - 1; + if (test_range_bit(tree, start, end, + EXTENT_DIRTY, 0)) { + unlock_page(page); + continue; + } + } clear_page_dirty_for_io(page); spin_lock_irq(&page->mapping->tree_lock); if (!PageDirty(page)) { @@ -3166,13 +3187,29 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree, { unsigned long i; unsigned long num_pages; - int was_dirty = 0; - was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); num_pages = num_extent_pages(eb->start, eb->len); - for (i = 0; i < num_pages; i++) + for (i = 0; i < num_pages; i++) { + struct page *page = extent_buffer_page(eb, i); + /* writepage may need to do something special for the + * first page, we have to make sure page->private is + * properly set. releasepage may drop page->private + * on us if the page isn't already dirty. 
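
The extent_io.c hunks here move metadata dirtiness back to EXTENT_DIRTY range bits in the io tree rather than a single per-buffer flag, which is why the partial-page checks reappear: with sub-page tree blocks, several buffers can share one page, so the page may only be cleaned once no byte range on it is still dirty. A toy model of that invariant (plain C; nothing here is the kernel's extent_io API):

    #include <stdbool.h>
    #include <stdio.h>

    #define PAGE_SIZE   4096
    #define BLOCK_SIZE  1024            /* four tree blocks per page */
    #define NPAGES      4

    /* one dirty bit per block: the stand-in for EXTENT_DIRTY bits */
    static bool block_dirty[NPAGES * (PAGE_SIZE / BLOCK_SIZE)];

    /* a page may be cleaned only if no block on it is still dirty */
    static bool page_can_be_cleaned(int page)
    {
        int per_page = PAGE_SIZE / BLOCK_SIZE;
        for (int i = 0; i < per_page; i++)
            if (block_dirty[page * per_page + i])
                return false;
        return true;
    }

    int main(void)
    {
        block_dirty[0] = true;      /* blocks 0 and 1 live on page 0 */
        block_dirty[1] = true;

        block_dirty[0] = false;     /* clear one buffer's dirty range */
        printf("page 0 cleanable: %d\n", page_can_be_cleaned(0)); /* 0 */

        block_dirty[1] = false;
        printf("page 0 cleanable: %d\n", page_can_be_cleaned(0)); /* 1 */
        return 0;
    }
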
+ */ + lock_page(page); + if (i == 0) { + set_page_extent_head(page, eb->len); + } else if (PagePrivate(page) && + page->private != EXTENT_PAGE_PRIVATE) { + set_page_extent_mapped(page); + } __set_page_dirty_nobuffers(extent_buffer_page(eb, i)); - return was_dirty; + set_extent_dirty(tree, page_offset(page), + page_offset(page) + PAGE_CACHE_SIZE - 1, + GFP_NOFS); + unlock_page(page); + } + return 0; } int clear_extent_buffer_uptodate(struct extent_io_tree *tree, @@ -3752,10 +3789,6 @@ int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page) ret = 0; goto out; } - if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { - ret = 0; - goto out; - } /* at this point we can safely release the extent buffer */ num_pages = num_extent_pages(eb->start, eb->len); for (i = 0; i < num_pages; i++) diff --git a/trunk/fs/btrfs/extent_io.h b/trunk/fs/btrfs/extent_io.h index 5bc20abf3f3d..1f9df88afbf6 100644 --- a/trunk/fs/btrfs/extent_io.h +++ b/trunk/fs/btrfs/extent_io.h @@ -25,7 +25,6 @@ /* these are bit numbers for test/set bit */ #define EXTENT_BUFFER_UPTODATE 0 #define EXTENT_BUFFER_BLOCKING 1 -#define EXTENT_BUFFER_DIRTY 2 /* * page->private values. Every page that is controlled by the extent @@ -255,8 +254,6 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree, struct extent_buffer *eb); int set_extent_buffer_dirty(struct extent_io_tree *tree, struct extent_buffer *eb); -int test_extent_buffer_dirty(struct extent_io_tree *tree, - struct extent_buffer *eb); int set_extent_buffer_uptodate(struct extent_io_tree *tree, struct extent_buffer *eb); int clear_extent_buffer_uptodate(struct extent_io_tree *tree, diff --git a/trunk/fs/btrfs/file-item.c b/trunk/fs/btrfs/file-item.c index 9b99886562d0..964652435fd1 100644 --- a/trunk/fs/btrfs/file-item.c +++ b/trunk/fs/btrfs/file-item.c @@ -52,7 +52,6 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, file_key.offset = pos; btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY); - path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, root, path, &file_key, sizeof(*item)); if (ret < 0) @@ -524,7 +523,6 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, key.offset = end_byte - 1; key.type = BTRFS_EXTENT_CSUM_KEY; - path->leave_spinning = 1; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) { if (path->slots[0] == 0) @@ -759,10 +757,8 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, } else { ins_size = csum_size; } - path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, root, path, &file_key, ins_size); - path->leave_spinning = 0; if (ret < 0) goto fail_unlock; if (ret != 0) { @@ -780,6 +776,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, item_end = (struct btrfs_csum_item *)((unsigned char *)item_end + btrfs_item_size_nr(leaf, path->slots[0])); eb_token = NULL; + cond_resched(); next_sector: if (!eb_token || @@ -820,9 +817,9 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, eb_token = NULL; } btrfs_mark_buffer_dirty(path->nodes[0]); + cond_resched(); if (total_bytes < sums->len) { btrfs_release_path(root, path); - cond_resched(); goto again; } out: diff --git a/trunk/fs/btrfs/file.c b/trunk/fs/btrfs/file.c index 9c9fb46ccd08..dc78954861b3 100644 --- a/trunk/fs/btrfs/file.c +++ b/trunk/fs/btrfs/file.c @@ -606,7 +606,6 @@ noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans, btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY); btrfs_release_path(root, path); - path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, root, path, 
&ins, sizeof(*extent)); BUG_ON(ret); @@ -640,22 +639,17 @@ noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans, ram_bytes); btrfs_set_file_extent_type(leaf, extent, found_type); - btrfs_unlock_up_safe(path, 1); btrfs_mark_buffer_dirty(path->nodes[0]); - btrfs_set_lock_blocking(path->nodes[0]); if (disk_bytenr != 0) { ret = btrfs_update_extent_ref(trans, root, - disk_bytenr, - le64_to_cpu(old.disk_num_bytes), - orig_parent, + disk_bytenr, orig_parent, leaf->start, root->root_key.objectid, trans->transid, ins.objectid); BUG_ON(ret); } - path->leave_spinning = 0; btrfs_release_path(root, path); if (disk_bytenr != 0) inode_add_bytes(inode, extent_end - end); @@ -918,7 +912,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, btrfs_set_file_extent_other_encoding(leaf, fi, 0); if (orig_parent != leaf->start) { - ret = btrfs_update_extent_ref(trans, root, bytenr, num_bytes, + ret = btrfs_update_extent_ref(trans, root, bytenr, orig_parent, leaf->start, root->root_key.objectid, trans->transid, inode->i_ino); @@ -1161,20 +1155,6 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, page_cache_release(pinned[1]); *ppos = pos; - /* - * we want to make sure fsync finds this change - * but we haven't joined a transaction running right now. - * - * Later on, someone is sure to update the inode and get the - * real transid recorded. - * - * We set last_trans now to the fs_info generation + 1, - * this will either be one more than the running transaction - * or the generation used for the next transaction if there isn't - * one running right now. - */ - BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; - if (num_written > 0 && will_write) { struct btrfs_trans_handle *trans; @@ -1187,11 +1167,8 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, ret = btrfs_log_dentry_safe(trans, root, file->f_dentry); if (ret == 0) { - ret = btrfs_sync_log(trans, root); - if (ret == 0) - btrfs_end_transaction(trans, root); - else - btrfs_commit_transaction(trans, root); + btrfs_sync_log(trans, root); + btrfs_end_transaction(trans, root); } else { btrfs_commit_transaction(trans, root); } @@ -1208,18 +1185,6 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, int btrfs_release_file(struct inode *inode, struct file *filp) { - /* - * ordered_data_close is set by settattr when we are about to truncate - * a file from a non-zero size to a zero size. This tries to - * flush down new bytes that may have been written if the - * application were using truncate to replace a file in place. 
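
The comment above describes the replace-via-truncate hazard that ordered_data_close existed to soften: an application rewrites a file with O_TRUNC and never calls fsync(), so a crash can leave a zero-length file. The portable way for an application to close that window itself is an explicit fsync() before close(), roughly:

    #include <fcntl.h>
    #include <unistd.h>

    /* rewrite a file in place; the fsync() before close() is what
     * keeps a crash from leaving the truncated, zero-length state
     * on disk */
    int replace_file_contents(const char *path, const char *buf, size_t len)
    {
        int fd = open(path, O_WRONLY | O_CREAT | O_TRUNC, 0644);
        if (fd < 0)
            return -1;
        if (write(fd, buf, len) != (ssize_t)len || fsync(fd) != 0) {
            close(fd);
            return -1;
        }
        return close(fd);
    }
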
- */ - if (BTRFS_I(inode)->ordered_data_close) { - BTRFS_I(inode)->ordered_data_close = 0; - btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode); - if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) - filemap_flush(inode->i_mapping); - } if (filp->private_data) btrfs_ioctl_trans_end(filp); return 0; @@ -1295,11 +1260,8 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) if (ret > 0) { ret = btrfs_commit_transaction(trans, root); } else { - ret = btrfs_sync_log(trans, root); - if (ret == 0) - ret = btrfs_end_transaction(trans, root); - else - ret = btrfs_commit_transaction(trans, root); + btrfs_sync_log(trans, root); + ret = btrfs_end_transaction(trans, root); } mutex_lock(&dentry->d_inode->i_mutex); out: diff --git a/trunk/fs/btrfs/inode-item.c b/trunk/fs/btrfs/inode-item.c index 6b627c611808..3d46fa1f29a4 100644 --- a/trunk/fs/btrfs/inode-item.c +++ b/trunk/fs/btrfs/inode-item.c @@ -73,8 +73,6 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - path->leave_spinning = 1; - ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) { ret = -ENOENT; @@ -129,7 +127,6 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, root, path, &key, ins_len); if (ret == -EEXIST) { diff --git a/trunk/fs/btrfs/inode.c b/trunk/fs/btrfs/inode.c index 06d8db5afb08..17e608c4dc70 100644 --- a/trunk/fs/btrfs/inode.c +++ b/trunk/fs/btrfs/inode.c @@ -134,7 +134,6 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - path->leave_spinning = 1; btrfs_set_trans_block_group(trans, inode); key.objectid = inode->i_ino; @@ -168,9 +167,9 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, cur_size = min_t(unsigned long, compressed_size, PAGE_CACHE_SIZE); - kaddr = kmap_atomic(cpage, KM_USER0); + kaddr = kmap(cpage); write_extent_buffer(leaf, kaddr, ptr, cur_size); - kunmap_atomic(kaddr, KM_USER0); + kunmap(cpage); i++; ptr += cur_size; @@ -205,7 +204,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, * does the checks required to make sure the data is small enough * to fit as an inline extent. 
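
cow_file_range_inline(), whose signature changes just below, only has to decide whether data is small enough to live inside the btree leaf itself instead of in separate data blocks. The shape of that decision is simple; the threshold below is a made-up stand-in, not the kernel's actual limit:

    #include <stdbool.h>
    #include <stdint.h>

    #define FAKE_MAX_INLINE 2048   /* illustrative threshold only */

    /* store tiny (possibly compressed) data inline in the leaf,
     * fall back to regular extents otherwise; inline data must
     * start at file offset 0 */
    static bool should_inline(uint64_t start, uint64_t end,
                              uint64_t compressed_size)
    {
        uint64_t size = compressed_size ? compressed_size
                                        : end - start + 1;
        return start == 0 && size <= FAKE_MAX_INLINE;
    }
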
*/ -static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, +static int cow_file_range_inline(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, u64 start, u64 end, size_t compressed_size, @@ -855,6 +854,11 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, u64 cur_end; int limit = 10 * 1024 * 1042; + if (!btrfs_test_opt(root, COMPRESS)) { + return cow_file_range(inode, locked_page, start, end, + page_started, nr_written, 1); + } + clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED | EXTENT_DELALLOC, 1, 0, GFP_NOFS); while (start < end) { @@ -931,8 +935,7 @@ static noinline int csum_exist_in_range(struct btrfs_root *root, * If no cow copies or snapshots exist, we write directly to the existing * blocks on disk */ -static noinline int run_delalloc_nocow(struct inode *inode, - struct page *locked_page, +static int run_delalloc_nocow(struct inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, int force, unsigned long *nr_written) { @@ -1130,7 +1133,6 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, unsigned long *nr_written) { int ret; - struct btrfs_root *root = BTRFS_I(inode)->root; if (btrfs_test_flag(inode, NODATACOW)) ret = run_delalloc_nocow(inode, locked_page, start, end, @@ -1138,12 +1140,10 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, else if (btrfs_test_flag(inode, PREALLOC)) ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, 0, nr_written); - else if (!btrfs_test_opt(root, COMPRESS)) - ret = cow_file_range(inode, locked_page, start, end, - page_started, nr_written, 1); else ret = cow_file_range_async(inode, locked_page, start, end, page_started, nr_written); + return ret; } @@ -1453,7 +1453,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); BUG_ON(!path); - path->leave_spinning = 1; ret = btrfs_drop_extents(trans, root, inode, file_pos, file_pos + num_bytes, file_pos, &hint); BUG_ON(ret); @@ -1476,10 +1475,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_set_file_extent_compression(leaf, fi, compression); btrfs_set_file_extent_encryption(leaf, fi, encryption); btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding); - - btrfs_unlock_up_safe(path, 1); - btrfs_set_lock_blocking(leaf); - btrfs_mark_buffer_dirty(leaf); inode_add_bytes(inode, num_bytes); @@ -1492,35 +1487,11 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, root->root_key.objectid, trans->transid, inode->i_ino, &ins); BUG_ON(ret); - btrfs_free_path(path); + btrfs_free_path(path); return 0; } -/* - * helper function for btrfs_finish_ordered_io, this - * just reads in some of the csum leaves to prime them into ram - * before we start the transaction. It limits the amount of btree - * reads required while inside the transaction. 
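
reada_csum(), removed just below, is an instance of a common pattern: issue cheap reads before taking heavyweight state (here, a transaction) so the expensive section does less blocking I/O. From userspace the analogous trick is hinting the kernel before entering a critical region, for example with posix_fadvise(); this is an analogy, not what the kernel code does internally:

    #define _XOPEN_SOURCE 600
    #include <fcntl.h>
    #include <unistd.h>

    /* warm the page cache for [off, off+len) before we grab locks
     * or open a transaction-like critical section */
    static int prefetch_range(int fd, off_t off, off_t len)
    {
        return posix_fadvise(fd, off, len, POSIX_FADV_WILLNEED);
    }
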
- */ -static noinline void reada_csum(struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_ordered_extent *ordered_extent) -{ - struct btrfs_ordered_sum *sum; - u64 bytenr; - - sum = list_entry(ordered_extent->list.next, struct btrfs_ordered_sum, - list); - bytenr = sum->sums[0].bytenr; - - /* - * we don't care about the results, the point of this search is - * just to get the btree leaves into ram - */ - btrfs_lookup_csum(NULL, root->fs_info->csum_root, path, bytenr, 0); -} - /* as ordered data IO finishes, this gets called so we can finish * an ordered extent if the range of bytes in the file it covers are * fully written. @@ -1529,9 +1500,8 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; - struct btrfs_ordered_extent *ordered_extent = NULL; + struct btrfs_ordered_extent *ordered_extent; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - struct btrfs_path *path; int compressed = 0; int ret; @@ -1539,33 +1509,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) if (!ret) return 0; - /* - * before we join the transaction, try to do some of our IO. - * This will limit the amount of IO that we have to do with - * the transaction running. We're unlikely to need to do any - * IO if the file extents are new, the disk_i_size checks - * covers the most common case. - */ - if (start < BTRFS_I(inode)->disk_i_size) { - path = btrfs_alloc_path(); - if (path) { - ret = btrfs_lookup_file_extent(NULL, root, path, - inode->i_ino, - start, 0); - ordered_extent = btrfs_lookup_ordered_extent(inode, - start); - if (!list_empty(&ordered_extent->list)) { - btrfs_release_path(root, path); - reada_csum(root, path, ordered_extent); - } - btrfs_free_path(path); - } - } - trans = btrfs_join_transaction(root, 1); - if (!ordered_extent) - ordered_extent = btrfs_lookup_ordered_extent(inode, start); + ordered_extent = btrfs_lookup_ordered_extent(inode, start); BUG_ON(!ordered_extent); if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) goto nocow; @@ -2155,7 +2101,6 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); BUG_ON(!path); - path->leave_spinning = 1; ret = btrfs_lookup_inode(trans, root, path, &BTRFS_I(inode)->location, 1); if (ret) { @@ -2202,7 +2147,6 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, goto err; } - path->leave_spinning = 1; di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, name, name_len, -1); if (IS_ERR(di)) { @@ -2246,6 +2190,8 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode, dir->i_ino); BUG_ON(ret != 0 && ret != -ENOENT); + if (ret != -ENOENT) + BTRFS_I(dir)->log_dirty_trans = trans->transid; ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, index); @@ -2278,9 +2224,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) trans = btrfs_start_transaction(root, 1); btrfs_set_trans_block_group(trans, dir); - - btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0); - ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, dentry->d_name.name, dentry->d_name.len); @@ -2555,7 +2498,6 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, key.type = (u8)-1; search_again: - path->leave_spinning = 1; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) goto error; @@ -2702,7 +2644,6 @@ noinline int 
btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, break; } if (found_extent) { - btrfs_set_path_blocking(path); ret = btrfs_free_extent(trans, root, extent_start, extent_num_bytes, leaf->start, root_owner, @@ -2907,21 +2848,11 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) if (err) return err; - if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { - if (attr->ia_size > inode->i_size) { - err = btrfs_cont_expand(inode, attr->ia_size); - if (err) - return err; - } else if (inode->i_size > 0 && - attr->ia_size == 0) { - - /* we're truncating a file that used to have good - * data down to zero. Make sure it gets into - * the ordered flush list so that any new writes - * get down to disk quickly. - */ - BTRFS_I(inode)->ordered_data_close = 1; - } + if (S_ISREG(inode->i_mode) && + attr->ia_valid & ATTR_SIZE && attr->ia_size > inode->i_size) { + err = btrfs_cont_expand(inode, attr->ia_size); + if (err) + return err; } err = inode_setattr(inode, attr); @@ -3053,14 +2984,13 @@ static noinline void init_btrfs_i(struct inode *inode) bi->disk_i_size = 0; bi->flags = 0; bi->index_cnt = (u64)-1; - bi->last_unlink_trans = 0; + bi->log_dirty_trans = 0; extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS); extent_io_tree_init(&BTRFS_I(inode)->io_tree, inode->i_mapping, GFP_NOFS); extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree, inode->i_mapping, GFP_NOFS); INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes); - INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations); btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree); mutex_init(&BTRFS_I(inode)->extent_mutex); mutex_init(&BTRFS_I(inode)->log_mutex); @@ -3519,7 +3449,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, sizes[0] = sizeof(struct btrfs_inode_item); sizes[1] = name_len + sizeof(*ref); - path->leave_spinning = 1; ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2); if (ret != 0) goto fail; @@ -3798,8 +3727,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, drop_inode = 1; nr = trans->blocks_used; - - btrfs_log_new_name(trans, inode, NULL, dentry->d_parent); btrfs_end_transaction_throttle(trans, root); fail: if (drop_inode) { @@ -4436,8 +4363,6 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) } ClearPageChecked(page); set_page_dirty(page); - - BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; unlock_extent(io_tree, page_start, page_end, GFP_NOFS); out_unlock: @@ -4463,27 +4388,6 @@ static void btrfs_truncate(struct inode *inode) btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); trans = btrfs_start_transaction(root, 1); - - /* - * setattr is responsible for setting the ordered_data_close flag, - * but that is only tested during the last file release. That - * could happen well after the next commit, leaving a great big - * window where new writes may get lost if someone chooses to write - * to this file after truncating to zero - * - * The inode doesn't have any dirty data here, and so if we commit - * this is a noop. If someone immediately starts writing to the inode - * it is very likely we'll catch some of their writes in this - * transaction, and the commit will find this file on the ordered - * data list with good things to send down. - * - * This is a best effort solution, there is still a window where - * using truncate to replace the contents of the file will - * end up with a zero length file after a crash. 
- */ - if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close) - btrfs_add_ordered_operation(trans, root, inode); - btrfs_set_trans_block_group(trans, inode); btrfs_i_size_write(inode, inode->i_size); @@ -4560,15 +4464,12 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->i_acl = BTRFS_ACL_NOT_CACHED; ei->i_default_acl = BTRFS_ACL_NOT_CACHED; INIT_LIST_HEAD(&ei->i_orphan); - INIT_LIST_HEAD(&ei->ordered_operations); return &ei->vfs_inode; } void btrfs_destroy_inode(struct inode *inode) { struct btrfs_ordered_extent *ordered; - struct btrfs_root *root = BTRFS_I(inode)->root; - WARN_ON(!list_empty(&inode->i_dentry)); WARN_ON(inode->i_data.nrpages); @@ -4579,24 +4480,13 @@ void btrfs_destroy_inode(struct inode *inode) BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED) posix_acl_release(BTRFS_I(inode)->i_default_acl); - /* - * Make sure we're properly removed from the ordered operation - * lists. - */ - smp_mb(); - if (!list_empty(&BTRFS_I(inode)->ordered_operations)) { - spin_lock(&root->fs_info->ordered_extent_lock); - list_del_init(&BTRFS_I(inode)->ordered_operations); - spin_unlock(&root->fs_info->ordered_extent_lock); - } - - spin_lock(&root->list_lock); + spin_lock(&BTRFS_I(inode)->root->list_lock); if (!list_empty(&BTRFS_I(inode)->i_orphan)) { printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan" " list\n", inode->i_ino); dump_stack(); } - spin_unlock(&root->list_lock); + spin_unlock(&BTRFS_I(inode)->root->list_lock); while (1) { ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); @@ -4721,36 +4611,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (ret) goto out_unlock; - /* - * we're using rename to replace one file with another. - * and the replacement file is large. Start IO on it now so - * we don't add too much work to the end of the transaction - */ - if (new_inode && old_inode && S_ISREG(old_inode->i_mode) && - new_inode->i_size && - old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) - filemap_flush(old_inode->i_mapping); - trans = btrfs_start_transaction(root, 1); - /* - * make sure the inode gets flushed if it is replacing - * something. - */ - if (new_inode && new_inode->i_size && - old_inode && S_ISREG(old_inode->i_mode)) { - btrfs_add_ordered_operation(trans, root, old_inode); - } - - /* - * this is an ugly little race, but the rename is required to make - * sure that if we crash, the inode is either at the old name - * or the new one. pinning the log transaction lets us make sure - * we don't allow a log commit to come in after we unlink the - * name but before we add the new name back in. 
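
The btrfs_pin_log_trans() call removed just below is an elevated-writer-count trick: while the count is nonzero a log commit cannot complete, which is what lets rename unlink the old name and insert the new one atomically with respect to fsync. A minimal model of that counter protocol (C11 atomics; names are illustrative):

    #include <stdatomic.h>
    #include <stdbool.h>

    static atomic_int log_writers;      /* models root->log_writers */

    /* "pin": hold off log commits until the matching end call */
    static void pin_log(void)  { atomic_fetch_add(&log_writers, 1); }
    static void end_log(void)  { atomic_fetch_sub(&log_writers, 1); }

    /* a log commit may only proceed once every pin is dropped */
    static bool log_commit_may_run(void)
    {
        return atomic_load(&log_writers) == 0;
    }
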
- */ - btrfs_pin_log_trans(root); - btrfs_set_trans_block_group(trans, new_dir); btrfs_inc_nlink(old_dentry->d_inode); @@ -4758,9 +4620,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, new_dir->i_ctime = new_dir->i_mtime = ctime; old_inode->i_ctime = ctime; - if (old_dentry->d_parent != new_dentry->d_parent) - btrfs_record_unlink_dir(trans, old_dir, old_inode, 1); - ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode, old_dentry->d_name.name, old_dentry->d_name.len); @@ -4792,14 +4651,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (ret) goto out_fail; - btrfs_log_new_name(trans, old_inode, old_dir, - new_dentry->d_parent); out_fail: - - /* this btrfs_end_log_trans just allows the current - * log-sub transaction to complete - */ - btrfs_end_log_trans(root); btrfs_end_transaction_throttle(trans, root); out_unlock: return ret; diff --git a/trunk/fs/btrfs/locking.c b/trunk/fs/btrfs/locking.c index a5310c0f41e2..47b0a88c12a2 100644 --- a/trunk/fs/btrfs/locking.c +++ b/trunk/fs/btrfs/locking.c @@ -71,13 +71,12 @@ void btrfs_clear_lock_blocking(struct extent_buffer *eb) static int btrfs_spin_on_block(struct extent_buffer *eb) { int i; - for (i = 0; i < 512; i++) { + cpu_relax(); if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) return 1; if (need_resched()) break; - cpu_relax(); } return 0; } @@ -96,15 +95,13 @@ int btrfs_try_spin_lock(struct extent_buffer *eb) { int i; - if (btrfs_spin_on_block(eb)) { - spin_nested(eb); - if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) - return 1; - spin_unlock(&eb->lock); - } + spin_nested(eb); + if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) + return 1; + spin_unlock(&eb->lock); + /* spin for a bit on the BLOCKING flag */ for (i = 0; i < 2; i++) { - cpu_relax(); if (!btrfs_spin_on_block(eb)) break; @@ -151,9 +148,6 @@ int btrfs_tree_lock(struct extent_buffer *eb) DEFINE_WAIT(wait); wait.func = btrfs_wake_function; - if (!btrfs_spin_on_block(eb)) - goto sleep; - while(1) { spin_nested(eb); @@ -171,10 +165,9 @@ int btrfs_tree_lock(struct extent_buffer *eb) * spin for a bit, and if the blocking flag goes away, * loop around */ - cpu_relax(); if (btrfs_spin_on_block(eb)) continue; -sleep: + prepare_to_wait_exclusive(&eb->lock_wq, &wait, TASK_UNINTERRUPTIBLE); diff --git a/trunk/fs/btrfs/ordered-data.c b/trunk/fs/btrfs/ordered-data.c index 53c87b197d70..77c2411a5f0f 100644 --- a/trunk/fs/btrfs/ordered-data.c +++ b/trunk/fs/btrfs/ordered-data.c @@ -310,16 +310,6 @@ int btrfs_remove_ordered_extent(struct inode *inode, spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); list_del_init(&entry->root_extent_list); - - /* - * we have no more ordered extents for this inode and - * no dirty pages. We can safely remove it from the - * list of ordered extents - */ - if (RB_EMPTY_ROOT(&tree->tree) && - !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) { - list_del_init(&BTRFS_I(inode)->ordered_operations); - } spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); mutex_unlock(&tree->mutex); @@ -379,68 +369,6 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only) return 0; } -/* - * this is used during transaction commit to write all the inodes - * added to the ordered operation list. These files must be fully on - * disk before the transaction commits. - * - * we have two modes here, one is to just start the IO via filemap_flush - * and the other is to wait for all the io. 
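
The locking.c hunks above strip the adaptive part of btrfs_tree_lock(): spin briefly on the BLOCKING bit in the hope the holder clears it soon, and only fall back to sleeping on the wait queue when spinning looks hopeless. The general spin-then-sleep shape, modeled with pthreads (constants and names are illustrative, not the kernel's):

    #include <pthread.h>
    #include <sched.h>
    #include <stdatomic.h>

    static atomic_int blocking;     /* models EXTENT_BUFFER_BLOCKING */
    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t unblocked = PTHREAD_COND_INITIALIZER;

    static void tree_lock(void)
    {
        /* cheap path: spin for a bounded number of iterations */
        for (int i = 0; i < 512; i++) {
            if (!atomic_load(&blocking))
                break;
            sched_yield();          /* stand-in for cpu_relax() */
        }

        /* slow path: sleep until the holder clears the flag */
        pthread_mutex_lock(&lock);
        while (atomic_load(&blocking))
            pthread_cond_wait(&unblocked, &lock);
        /* ...critical section would run here... */
        pthread_mutex_unlock(&lock);
    }

    /* the holder clears the flag and wakes all sleepers */
    static void clear_blocking(void)
    {
        pthread_mutex_lock(&lock);
        atomic_store(&blocking, 0);
        pthread_cond_broadcast(&unblocked);
        pthread_mutex_unlock(&lock);
    }
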
When we wait, we have an - * extra check to make sure the ordered operation list really is empty - * before we return - */ -int btrfs_run_ordered_operations(struct btrfs_root *root, int wait) -{ - struct btrfs_inode *btrfs_inode; - struct inode *inode; - struct list_head splice; - - INIT_LIST_HEAD(&splice); - - mutex_lock(&root->fs_info->ordered_operations_mutex); - spin_lock(&root->fs_info->ordered_extent_lock); -again: - list_splice_init(&root->fs_info->ordered_operations, &splice); - - while (!list_empty(&splice)) { - btrfs_inode = list_entry(splice.next, struct btrfs_inode, - ordered_operations); - - inode = &btrfs_inode->vfs_inode; - - list_del_init(&btrfs_inode->ordered_operations); - - /* - * the inode may be getting freed (in sys_unlink path). - */ - inode = igrab(inode); - - if (!wait && inode) { - list_add_tail(&BTRFS_I(inode)->ordered_operations, - &root->fs_info->ordered_operations); - } - spin_unlock(&root->fs_info->ordered_extent_lock); - - if (inode) { - if (wait) - btrfs_wait_ordered_range(inode, 0, (u64)-1); - else - filemap_flush(inode->i_mapping); - iput(inode); - } - - cond_resched(); - spin_lock(&root->fs_info->ordered_extent_lock); - } - if (wait && !list_empty(&root->fs_info->ordered_operations)) - goto again; - - spin_unlock(&root->fs_info->ordered_extent_lock); - mutex_unlock(&root->fs_info->ordered_operations_mutex); - - return 0; -} - /* * Used to start IO or wait for a given ordered extent to finish. * @@ -798,49 +726,3 @@ int btrfs_wait_on_page_writeback_range(struct address_space *mapping, return ret; } - -/* - * add a given inode to the list of inodes that must be fully on - * disk before a transaction commit finishes. - * - * This basically gives us the ext3 style data=ordered mode, and it is mostly - * used to make sure renamed files are fully on disk. - * - * It is a noop if the inode is already fully on disk. - * - * If trans is not null, we'll do a friendly check for a transaction that - * is already flushing things and force the IO down ourselves. - */ -int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct inode *inode) -{ - u64 last_mod; - - last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans); - - /* - * if this file hasn't been changed since the last transaction - * commit, we can safely return without doing anything - */ - if (last_mod < root->fs_info->last_trans_committed) - return 0; - - /* - * the transaction is already committing. 
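
btrfs_run_ordered_operations(), deleted here, uses two patterns worth noting: splice the shared list onto a private one so the lock can be dropped while each entry is processed, and take a reference (igrab()) before dropping the lock so the inode cannot be freed underneath us. A skeletal version of the splice-and-process loop (a pthread mutex standing in for the spinlock; all types illustrative):

    #include <pthread.h>
    #include <stdio.h>

    struct node { struct node *next; int id; };

    static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct node *shared;   /* models the ordered_operations list */

    static void flush_one(struct node *n, int wait)
    {
        printf("%s inode %d\n", wait ? "waiting on" : "flushing", n->id);
    }

    static void run_ordered_operations(int wait)
    {
        pthread_mutex_lock(&list_lock);
        struct node *splice = shared;       /* steal the whole list */
        shared = NULL;
        pthread_mutex_unlock(&list_lock);   /* work without the lock */

        while (splice) {
            struct node *n = splice;
            splice = n->next;
            flush_one(n, wait);             /* filemap_flush vs. wait */
        }
    }
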
Just start the IO and - * don't bother with all of this list nonsense - */ - if (trans && root->fs_info->running_transaction->blocked) { - btrfs_wait_ordered_range(inode, 0, (u64)-1); - return 0; - } - - spin_lock(&root->fs_info->ordered_extent_lock); - if (list_empty(&BTRFS_I(inode)->ordered_operations)) { - list_add_tail(&BTRFS_I(inode)->ordered_operations, - &root->fs_info->ordered_operations); - } - spin_unlock(&root->fs_info->ordered_extent_lock); - - return 0; -} diff --git a/trunk/fs/btrfs/ordered-data.h b/trunk/fs/btrfs/ordered-data.h index 3d31c8827b01..ab66d5e8d6d6 100644 --- a/trunk/fs/btrfs/ordered-data.h +++ b/trunk/fs/btrfs/ordered-data.h @@ -155,8 +155,4 @@ int btrfs_wait_on_page_writeback_range(struct address_space *mapping, int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start, loff_t end, int sync_mode); int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only); -int btrfs_run_ordered_operations(struct btrfs_root *root, int wait); -int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct inode *inode); #endif diff --git a/trunk/fs/btrfs/transaction.c b/trunk/fs/btrfs/transaction.c index 664782c6a2df..4112d53d4f4d 100644 --- a/trunk/fs/btrfs/transaction.c +++ b/trunk/fs/btrfs/transaction.c @@ -65,15 +65,6 @@ static noinline int join_transaction(struct btrfs_root *root) cur_trans->use_count = 1; cur_trans->commit_done = 0; cur_trans->start_time = get_seconds(); - - cur_trans->delayed_refs.root.rb_node = NULL; - cur_trans->delayed_refs.num_entries = 0; - cur_trans->delayed_refs.num_heads_ready = 0; - cur_trans->delayed_refs.num_heads = 0; - cur_trans->delayed_refs.flushing = 0; - cur_trans->delayed_refs.run_delayed_start = 0; - spin_lock_init(&cur_trans->delayed_refs.lock); - INIT_LIST_HEAD(&cur_trans->pending_snapshots); list_add_tail(&cur_trans->list, &root->fs_info->trans_list); extent_io_tree_init(&cur_trans->dirty_pages, @@ -191,8 +182,6 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, h->block_group = 0; h->alloc_exclude_nr = 0; h->alloc_exclude_start = 0; - h->delayed_ref_updates = 0; - root->fs_info->running_transaction->use_count++; mutex_unlock(&root->fs_info->trans_mutex); return h; @@ -282,6 +271,7 @@ void btrfs_throttle(struct btrfs_root *root) if (!root->fs_info->open_ioctl_trans) wait_current_trans(root); mutex_unlock(&root->fs_info->trans_mutex); + throttle_on_drops(root); } @@ -290,27 +280,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, { struct btrfs_transaction *cur_trans; struct btrfs_fs_info *info = root->fs_info; - int count = 0; - - while (count < 4) { - unsigned long cur = trans->delayed_ref_updates; - trans->delayed_ref_updates = 0; - if (cur && - trans->transaction->delayed_refs.num_heads_ready > 64) { - trans->delayed_ref_updates = 0; - - /* - * do a full flush if the transaction is trying - * to close - */ - if (trans->transaction->delayed_refs.flushing) - cur = 0; - btrfs_run_delayed_refs(trans, root, cur); - } else { - break; - } - count++; - } mutex_lock(&info->trans_mutex); cur_trans = info->running_transaction; @@ -455,10 +424,9 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, u64 old_root_bytenr; struct btrfs_root *tree_root = root->fs_info->tree_root; + btrfs_extent_post_op(trans, root); btrfs_write_dirty_block_groups(trans, root); - - ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); - BUG_ON(ret); + btrfs_extent_post_op(trans, root); while (1) { old_root_bytenr = 
btrfs_root_bytenr(&root->root_item); @@ -470,14 +438,14 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, btrfs_header_level(root->node)); btrfs_set_root_generation(&root->root_item, trans->transid); + btrfs_extent_post_op(trans, root); + ret = btrfs_update_root(trans, tree_root, &root->root_key, &root->root_item); BUG_ON(ret); btrfs_write_dirty_block_groups(trans, root); - - ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); - BUG_ON(ret); + btrfs_extent_post_op(trans, root); } return 0; } @@ -491,18 +459,15 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = root->fs_info; struct list_head *next; struct extent_buffer *eb; - int ret; - ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); - BUG_ON(ret); + btrfs_extent_post_op(trans, fs_info->tree_root); eb = btrfs_lock_root_node(fs_info->tree_root); - btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb); + btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb, 0); btrfs_tree_unlock(eb); free_extent_buffer(eb); - ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); - BUG_ON(ret); + btrfs_extent_post_op(trans, fs_info->tree_root); while (!list_empty(&fs_info->dirty_cowonly_roots)) { next = fs_info->dirty_cowonly_roots.next; @@ -510,9 +475,6 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, root = list_entry(next, struct btrfs_root, dirty_list); update_cowonly_root(trans, root); - - ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); - BUG_ON(ret); } return 0; } @@ -672,31 +634,6 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) return 0; } -/* - * when dropping snapshots, we generate a ton of delayed refs, and it makes - * sense not to join the transaction while it is trying to flush the current - * queue of delayed refs out. 
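
The loop removed from __btrfs_end_transaction() a few hunks back bounds how much deferred work any one caller is charged with: run delayed updates only if enough have accumulated (more than 64 heads ready), and make at most four passes so no caller is trapped doing cleanup forever. That throttling shape in isolation (the thresholds are copied from the removed code; everything else is illustrative):

    /* charge the caller with a bounded amount of deferred work */
    static void throttle_deferred_work(unsigned long *pending_updates,
                                       unsigned long heads_ready,
                                       void (*run)(unsigned long count))
    {
        for (int pass = 0; pass < 4; pass++) {
            unsigned long cur = *pending_updates;
            *pending_updates = 0;
            if (!cur || heads_ready <= 64)
                break;      /* too little queued to bother */
            run(cur);       /* models btrfs_run_delayed_refs() */
        }
    }
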
- * - * This is used by the drop snapshot code only - */ -static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info) -{ - DEFINE_WAIT(wait); - - mutex_lock(&info->trans_mutex); - while (info->running_transaction && - info->running_transaction->delayed_refs.flushing) { - prepare_to_wait(&info->transaction_wait, &wait, - TASK_UNINTERRUPTIBLE); - mutex_unlock(&info->trans_mutex); - schedule(); - mutex_lock(&info->trans_mutex); - finish_wait(&info->transaction_wait, &wait); - } - mutex_unlock(&info->trans_mutex); - return 0; -} - /* * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on * all of them @@ -724,22 +661,7 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root, atomic_inc(&root->fs_info->throttles); while (1) { - /* - * we don't want to jump in and create a bunch of - * delayed refs if the transaction is starting to close - */ - wait_transaction_pre_flush(tree_root->fs_info); trans = btrfs_start_transaction(tree_root, 1); - - /* - * we've joined a transaction, make sure it isn't - * closing right now - */ - if (trans->transaction->delayed_refs.flushing) { - btrfs_end_transaction(trans, tree_root); - continue; - } - mutex_lock(&root->fs_info->drop_mutex); ret = btrfs_drop_snapshot(trans, dirty->root); if (ret != -EAGAIN) @@ -844,7 +766,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); old = btrfs_lock_root_node(root); - btrfs_cow_block(trans, root, old, NULL, 0, &old); + btrfs_cow_block(trans, root, old, NULL, 0, &old, 0); btrfs_copy_root(trans, root, old, &tmp, objectid); btrfs_tree_unlock(old); @@ -972,31 +894,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct extent_io_tree *pinned_copy; DEFINE_WAIT(wait); int ret; - int should_grow = 0; - unsigned long now = get_seconds(); - - btrfs_run_ordered_operations(root, 0); - - /* make a pass through all the delayed refs we have so far - * any running procs may add more while we are here - */ - ret = btrfs_run_delayed_refs(trans, root, 0); - BUG_ON(ret); - - cur_trans = trans->transaction; - /* - * set the flushing flag so procs in this transaction have to - * start sending their work down. 
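
wait_transaction_pre_flush(), shown being removed above, is the classic sleep-until-condition loop: re-check the predicate under the lock, register as a waiter, drop the lock, sleep, re-take the lock, repeat. The same protocol with a pthread condition variable (a model of the shape, not of the kernel wait-queue API):

    #include <pthread.h>
    #include <stdbool.h>

    static pthread_mutex_t trans_mutex = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t transaction_wait = PTHREAD_COND_INITIALIZER;
    static bool flushing;       /* models delayed_refs.flushing */

    static void wait_transaction_pre_flush(void)
    {
        pthread_mutex_lock(&trans_mutex);
        while (flushing)        /* predicate re-checked under lock */
            pthread_cond_wait(&transaction_wait, &trans_mutex);
        pthread_mutex_unlock(&trans_mutex);
    }

    /* the flusher clears the flag and wakes all sleepers */
    static void flush_done(void)
    {
        pthread_mutex_lock(&trans_mutex);
        flushing = false;
        pthread_cond_broadcast(&transaction_wait);
        pthread_mutex_unlock(&trans_mutex);
    }
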
- */ - cur_trans->delayed_refs.flushing = 1; - - ret = btrfs_run_delayed_refs(trans, root, 0); - BUG_ON(ret); - mutex_lock(&root->fs_info->trans_mutex); INIT_LIST_HEAD(&dirty_fs_roots); - if (cur_trans->in_commit) { - cur_trans->use_count++; + mutex_lock(&root->fs_info->trans_mutex); + if (trans->transaction->in_commit) { + cur_trans = trans->transaction; + trans->transaction->use_count++; mutex_unlock(&root->fs_info->trans_mutex); btrfs_end_transaction(trans, root); @@ -1019,6 +922,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, trans->transaction->in_commit = 1; trans->transaction->blocked = 1; + cur_trans = trans->transaction; if (cur_trans->list.prev != &root->fs_info->trans_list) { prev_trans = list_entry(cur_trans->list.prev, struct btrfs_transaction, list); @@ -1033,9 +937,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, } } - if (now < cur_trans->start_time || now - cur_trans->start_time < 1) - should_grow = 1; - do { int snap_pending = 0; joined = cur_trans->num_joined; @@ -1048,7 +949,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, if (cur_trans->num_writers > 1) timeout = MAX_SCHEDULE_TIMEOUT; - else if (should_grow) + else timeout = 1; mutex_unlock(&root->fs_info->trans_mutex); @@ -1058,30 +959,16 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, BUG_ON(ret); } - /* - * rename doesn't use btrfs_join_transaction, so, once we - * set the transaction to blocked above, we aren't going - * to get any new ordered operations. We can safely run - * it here and know for sure that nothing new will be added - * to the list - */ - btrfs_run_ordered_operations(root, 1); - - smp_mb(); - if (cur_trans->num_writers > 1 || should_grow) - schedule_timeout(timeout); + schedule_timeout(timeout); mutex_lock(&root->fs_info->trans_mutex); finish_wait(&cur_trans->writer_wait, &wait); } while (cur_trans->num_writers > 1 || - (should_grow && cur_trans->num_joined != joined)); + (cur_trans->num_joined != joined)); ret = create_pending_snapshots(trans, root->fs_info); BUG_ON(ret); - ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); - BUG_ON(ret); - WARN_ON(cur_trans != trans->transaction); /* btrfs_commit_tree_roots is responsible for getting the @@ -1145,7 +1032,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_copy_pinned(root, pinned_copy); trans->transaction->blocked = 0; - wake_up(&root->fs_info->transaction_throttle); wake_up(&root->fs_info->transaction_wait); @@ -1172,7 +1058,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, mutex_lock(&root->fs_info->trans_mutex); cur_trans->commit_done = 1; - root->fs_info->last_trans_committed = cur_trans->transid; wake_up(&cur_trans->commit_wait); diff --git a/trunk/fs/btrfs/transaction.h b/trunk/fs/btrfs/transaction.h index 94f5bde2b58d..ea292117f882 100644 --- a/trunk/fs/btrfs/transaction.h +++ b/trunk/fs/btrfs/transaction.h @@ -19,16 +19,10 @@ #ifndef __BTRFS_TRANSACTION__ #define __BTRFS_TRANSACTION__ #include "btrfs_inode.h" -#include "delayed-ref.h" struct btrfs_transaction { u64 transid; - /* - * total writers in this transaction, it must be zero before the - * transaction can end - */ unsigned long num_writers; - unsigned long num_joined; int in_commit; int use_count; @@ -40,7 +34,6 @@ struct btrfs_transaction { wait_queue_head_t writer_wait; wait_queue_head_t commit_wait; struct list_head pending_snapshots; - struct btrfs_delayed_ref_root delayed_refs; }; struct btrfs_trans_handle { @@ -51,7 +44,6 @@ struct btrfs_trans_handle { 
u64 block_group; u64 alloc_exclude_start; u64 alloc_exclude_nr; - unsigned long delayed_ref_updates; }; struct btrfs_pending_snapshot { diff --git a/trunk/fs/btrfs/tree-defrag.c b/trunk/fs/btrfs/tree-defrag.c index b10eacdb1620..98d25fa4570e 100644 --- a/trunk/fs/btrfs/tree-defrag.c +++ b/trunk/fs/btrfs/tree-defrag.c @@ -124,6 +124,8 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, } btrfs_release_path(root, path); + if (is_extent) + btrfs_extent_post_op(trans, root); out: if (path) btrfs_free_path(path); diff --git a/trunk/fs/btrfs/tree-log.c b/trunk/fs/btrfs/tree-log.c index fc9b87a7975b..9c462fbd60fa 100644 --- a/trunk/fs/btrfs/tree-log.c +++ b/trunk/fs/btrfs/tree-log.c @@ -34,49 +34,6 @@ #define LOG_INODE_ALL 0 #define LOG_INODE_EXISTS 1 -/* - * directory trouble cases - * - * 1) on rename or unlink, if the inode being unlinked isn't in the fsync - * log, we must force a full commit before doing an fsync of the directory - * where the unlink was done. - * ---> record transid of last unlink/rename per directory - * - * mkdir foo/some_dir - * normal commit - * rename foo/some_dir foo2/some_dir - * mkdir foo/some_dir - * fsync foo/some_dir/some_file - * - * The fsync above will unlink the original some_dir without recording - * it in its new location (foo2). After a crash, some_dir will be gone - * unless the fsync of some_file forces a full commit - * - * 2) we must log any new names for any file or dir that is in the fsync - * log. ---> check inode while renaming/linking. - * - * 2a) we must log any new names for any file or dir during rename - * when the directory they are being removed from was logged. - * ---> check inode and old parent dir during rename - * - * 2a is actually the more important variant. With the extra logging - * a crash might unlink the old name without recreating the new one - * - * 3) after a crash, we must go through any directories with a link count - * of zero and redo the rm -rf - * - * mkdir f1/foo - * normal commit - * rm -rf f1/foo - * fsync(f1) - * - * The directory f1 was fully removed from the FS, but fsync was never - * called on f1, only its parent dir. After a crash the rm -rf must - * be replayed. This must be able to recurse down the entire - * directory tree. The inode link count fixup code takes care of the - * ugly details. - */ - /* * stages for the tree walking. 
The first * stage (0) is to only pin down the blocks we find @@ -90,17 +47,12 @@ #define LOG_WALK_REPLAY_INODES 1 #define LOG_WALK_REPLAY_ALL 2 -static int btrfs_log_inode(struct btrfs_trans_handle *trans, +static int __btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, int inode_only); static int link_to_fixup_dir(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid); -static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_root *log, - struct btrfs_path *path, - u64 dirid, int del_all); /* * tree logging is a special write ahead log used to make sure that @@ -180,26 +132,11 @@ static int join_running_log_trans(struct btrfs_root *root) return ret; } -/* - * This either makes the current running log transaction wait - * until you call btrfs_end_log_trans() or it makes any future - * log transactions wait until you call btrfs_end_log_trans() - */ -int btrfs_pin_log_trans(struct btrfs_root *root) -{ - int ret = -ENOENT; - - mutex_lock(&root->log_mutex); - atomic_inc(&root->log_writers); - mutex_unlock(&root->log_mutex); - return ret; -} - /* * indicate we're done making changes to the log tree * and wake up anyone waiting to do a sync */ -int btrfs_end_log_trans(struct btrfs_root *root) +static int end_log_trans(struct btrfs_root *root) { if (atomic_dec_and_test(&root->log_writers)) { smp_mb(); @@ -266,6 +203,7 @@ static int process_one_buffer(struct btrfs_root *log, mutex_lock(&log->fs_info->pinned_mutex); btrfs_update_pinned_extents(log->fs_info->extent_root, eb->start, eb->len, 1); + mutex_unlock(&log->fs_info->pinned_mutex); } if (btrfs_buffer_uptodate(eb, gen)) { @@ -665,7 +603,6 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, ret = link_to_fixup_dir(trans, root, path, location.objectid); BUG_ON(ret); - ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len); BUG_ON(ret); kfree(name); @@ -867,7 +804,6 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, victim_name_len)) { btrfs_inc_nlink(inode); btrfs_release_path(root, path); - ret = btrfs_unlink_inode(trans, root, dir, inode, victim_name, victim_name_len); @@ -986,20 +922,13 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, key.offset--; btrfs_release_path(root, path); } - btrfs_release_path(root, path); + btrfs_free_path(path); if (nlink != inode->i_nlink) { inode->i_nlink = nlink; btrfs_update_inode(trans, root, inode); } BTRFS_I(inode)->index_cnt = (u64)-1; - if (inode->i_nlink == 0 && S_ISDIR(inode->i_mode)) { - ret = replay_dir_deletes(trans, root, NULL, path, - inode->i_ino, 1); - BUG_ON(ret); - } - btrfs_free_path(path); - - return 0; } @@ -1042,12 +971,9 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, iput(inode); - /* - * fixup on a directory may create new entries, - * make sure we always look for the highest possible - * offset - */ - key.offset = (u64)-1; + if (key.offset == 0) + break; + key.offset--; } btrfs_release_path(root, path); return 0; } @@ -1387,11 +1313,11 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, read_extent_buffer(eb, name, (unsigned long)(di + 1), name_len); log_di = NULL; - if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) { + if (dir_key->type == BTRFS_DIR_ITEM_KEY) { log_di = btrfs_lookup_dir_item(trans, log, log_path, dir_key->objectid, name, name_len, 0); - } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) { 
+ } else if (dir_key->type == BTRFS_DIR_INDEX_KEY) { log_di = btrfs_lookup_dir_index_item(trans, log, log_path, dir_key->objectid, @@ -1452,7 +1378,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_root *log, struct btrfs_path *path, - u64 dirid, int del_all) + u64 dirid) { u64 range_start; u64 range_end; @@ -1482,14 +1408,10 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, range_start = 0; range_end = 0; while (1) { - if (del_all) - range_end = (u64)-1; - else { - ret = find_dir_range(log, path, dirid, key_type, - &range_start, &range_end); - if (ret != 0) - break; - } + ret = find_dir_range(log, path, dirid, key_type, + &range_start, &range_end); + if (ret != 0) + break; dir_key.offset = range_start; while (1) { @@ -1515,8 +1437,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, break; ret = check_item_in_log(trans, root, log, path, - log_path, dir, - &found_key); + log_path, dir, &found_key); BUG_ON(ret); if (found_key.offset == (u64)-1) break; @@ -1593,7 +1514,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, mode = btrfs_inode_mode(eb, inode_item); if (S_ISDIR(mode)) { ret = replay_dir_deletes(wc->trans, - root, log, path, key.objectid, 0); + root, log, path, key.objectid); BUG_ON(ret); } ret = overwrite_item(wc->trans, root, path, @@ -1612,17 +1533,6 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, root, inode, inode->i_size, BTRFS_EXTENT_DATA_KEY); BUG_ON(ret); - - /* if the nlink count is zero here, the iput - * will free the inode. We bump it to make - * sure it doesn't get freed until the link - * count fixup is done - */ - if (inode->i_nlink == 0) { - btrfs_inc_nlink(inode); - btrfs_update_inode(wc->trans, - root, inode); - } iput(inode); } ret = link_to_fixup_dir(wc->trans, root, @@ -1930,8 +1840,7 @@ static int update_log_root(struct btrfs_trans_handle *trans, return ret; } -static int wait_log_commit(struct btrfs_trans_handle *trans, - struct btrfs_root *root, unsigned long transid) +static int wait_log_commit(struct btrfs_root *root, unsigned long transid) { DEFINE_WAIT(wait); int index = transid % 2; @@ -1945,12 +1854,9 @@ static int wait_log_commit(struct btrfs_trans_handle *trans, prepare_to_wait(&root->log_commit_wait[index], &wait, TASK_UNINTERRUPTIBLE); mutex_unlock(&root->log_mutex); - - if (root->fs_info->last_trans_log_full_commit != - trans->transid && root->log_transid < transid + 2 && + if (root->log_transid < transid + 2 && atomic_read(&root->log_commit[index])) schedule(); - finish_wait(&root->log_commit_wait[index], &wait); mutex_lock(&root->log_mutex); } while (root->log_transid < transid + 2 && @@ -1958,16 +1864,14 @@ static int wait_log_commit(struct btrfs_trans_handle *trans, return 0; } -static int wait_for_writer(struct btrfs_trans_handle *trans, - struct btrfs_root *root) +static int wait_for_writer(struct btrfs_root *root) { DEFINE_WAIT(wait); while (atomic_read(&root->log_writers)) { prepare_to_wait(&root->log_writer_wait, &wait, TASK_UNINTERRUPTIBLE); mutex_unlock(&root->log_mutex); - if (root->fs_info->last_trans_log_full_commit != - trans->transid && atomic_read(&root->log_writers)) + if (atomic_read(&root->log_writers)) schedule(); mutex_lock(&root->log_mutex); finish_wait(&root->log_writer_wait, &wait); @@ -1978,14 +1882,7 @@ static int wait_for_writer(struct btrfs_trans_handle *trans, /* * btrfs_sync_log sends a given tree log down to the disk and * updates 
the super blocks to record it. When this call is done, - * you know that any inodes previously logged are safely on disk only - * if it returns 0. - * - * Any other return value means you need to call btrfs_commit_transaction. - * Some of the edge cases for fsyncing directories that have had unlinks - * or renames done in the past mean that sometimes the only safe - * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN, - * that has happened. + * you know that any inodes previously logged are safely on disk */ int btrfs_sync_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) @@ -1999,7 +1896,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, mutex_lock(&root->log_mutex); index1 = root->log_transid % 2; if (atomic_read(&root->log_commit[index1])) { - wait_log_commit(trans, root, root->log_transid); + wait_log_commit(root, root->log_transid); mutex_unlock(&root->log_mutex); return 0; } @@ -2007,26 +1904,18 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, /* wait for previous tree log sync to complete */ if (atomic_read(&root->log_commit[(index1 + 1) % 2])) - wait_log_commit(trans, root, root->log_transid - 1); + wait_log_commit(root, root->log_transid - 1); while (1) { unsigned long batch = root->log_batch; mutex_unlock(&root->log_mutex); schedule_timeout_uninterruptible(1); mutex_lock(&root->log_mutex); - - wait_for_writer(trans, root); + wait_for_writer(root); if (batch == root->log_batch) break; } - /* bail out if we need to do a full commit */ - if (root->fs_info->last_trans_log_full_commit == trans->transid) { - ret = -EAGAIN; - mutex_unlock(&root->log_mutex); - goto out; - } - ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages); BUG_ON(ret); @@ -2062,29 +1951,16 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, index2 = log_root_tree->log_transid % 2; if (atomic_read(&log_root_tree->log_commit[index2])) { - wait_log_commit(trans, log_root_tree, - log_root_tree->log_transid); + wait_log_commit(log_root_tree, log_root_tree->log_transid); mutex_unlock(&log_root_tree->log_mutex); goto out; } atomic_set(&log_root_tree->log_commit[index2], 1); - if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { - wait_log_commit(trans, log_root_tree, - log_root_tree->log_transid - 1); - } - - wait_for_writer(trans, log_root_tree); + if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) + wait_log_commit(log_root_tree, log_root_tree->log_transid - 1); - /* - * now that we've moved on to the tree of log tree roots, - * check the full commit flag again - */ - if (root->fs_info->last_trans_log_full_commit == trans->transid) { - mutex_unlock(&log_root_tree->log_mutex); - ret = -EAGAIN; - goto out_wake_log_root; - } + wait_for_writer(log_root_tree); ret = btrfs_write_and_wait_marked_extents(log_root_tree, &log_root_tree->dirty_log_pages); @@ -2109,9 +1985,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * in and cause problems either. */ write_ctree_super(trans, root->fs_info->tree_root, 2); - ret = 0; -out_wake_log_root: atomic_set(&log_root_tree->log_commit[index2], 0); smp_mb(); if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) @@ -2124,8 +1998,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, return 0; } -/* - * free all the extents used by the tree log. This should be called +/* * free all the extents used by the tree log. 
This should be called * at commit time of the full transaction */ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) @@ -2259,7 +2132,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, btrfs_free_path(path); mutex_unlock(&BTRFS_I(dir)->log_mutex); - btrfs_end_log_trans(root); + end_log_trans(root); return 0; } @@ -2286,7 +2159,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino, dirid, &index); mutex_unlock(&BTRFS_I(inode)->log_mutex); - btrfs_end_log_trans(root); + end_log_trans(root); return ret; } @@ -2686,7 +2559,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, * * This handles both files and directories. */ -static int btrfs_log_inode(struct btrfs_trans_handle *trans, +static int __btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, int inode_only) { @@ -2712,17 +2585,28 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, min_key.offset = 0; max_key.objectid = inode->i_ino; - - /* today the code can only do partial logging of directories */ - if (!S_ISDIR(inode->i_mode)) - inode_only = LOG_INODE_ALL; - if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) max_key.type = BTRFS_XATTR_ITEM_KEY; else max_key.type = (u8)-1; max_key.offset = (u64)-1; + /* + * if this inode has already been logged and we're in inode_only + * mode, we don't want to delete the things that have already + * been written to the log. + * + * But, if the inode has been through an inode_only log, + * the logged_trans field is not set. This allows us to catch + * any new names for this inode in the backrefs by logging it + * again + */ + if (inode_only == LOG_INODE_EXISTS && + BTRFS_I(inode)->logged_trans == trans->transid) { + btrfs_free_path(path); + btrfs_free_path(dst_path); + goto out; + } mutex_lock(&BTRFS_I(inode)->log_mutex); /* @@ -2809,6 +2693,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { btrfs_release_path(root, path); btrfs_release_path(log, dst_path); + BTRFS_I(inode)->log_dirty_trans = 0; ret = log_directory_changes(trans, root, inode, path, dst_path); BUG_ON(ret); } @@ -2817,69 +2702,19 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, btrfs_free_path(path); btrfs_free_path(dst_path); +out: return 0; } -/* - * follow the dentry parent pointers up the chain and see if any - * of the directories in it require a full commit before they can - * be logged. Returns zero if nothing special needs to be done or 1 if - * a full commit is required. - */ -static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, - struct inode *inode, - struct dentry *parent, - struct super_block *sb, - u64 last_committed) +int btrfs_log_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + int inode_only) { - int ret = 0; - struct btrfs_root *root; - - /* - * for regular files, if its inode is already on disk, we don't - * have to worry about the parents at all. This is because - * we can use the last_unlink_trans field to record renames - * and other fun in this file. 
- */ - if (S_ISREG(inode->i_mode) && - BTRFS_I(inode)->generation <= last_committed && - BTRFS_I(inode)->last_unlink_trans <= last_committed) - goto out; - - if (!S_ISDIR(inode->i_mode)) { - if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) - goto out; - inode = parent->d_inode; - } - - while (1) { - BTRFS_I(inode)->logged_trans = trans->transid; - smp_mb(); - - if (BTRFS_I(inode)->last_unlink_trans > last_committed) { - root = BTRFS_I(inode)->root; - - /* - * make sure any commits to the log are forced - * to be full commits - */ - root->fs_info->last_trans_log_full_commit = - trans->transid; - ret = 1; - break; - } - - if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) - break; - - if (parent == sb->s_root) - break; - - parent = parent->d_parent; - inode = parent->d_inode; + int ret; - } -out: + start_log_trans(trans, root); + ret = __btrfs_log_inode(trans, root, inode, inode_only); + end_log_trans(root); return ret; } @@ -2889,65 +2724,31 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, * only logging is done of any parent directories that are older than * the last committed transaction */ -int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, - struct dentry *parent, int exists_only) +int btrfs_log_dentry(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct dentry *dentry) { - int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL; + int inode_only = LOG_INODE_ALL; struct super_block *sb; - int ret = 0; - u64 last_committed = root->fs_info->last_trans_committed; - - sb = inode->i_sb; - - if (root->fs_info->last_trans_log_full_commit > - root->fs_info->last_trans_committed) { - ret = 1; - goto end_no_trans; - } - - ret = check_parent_dirs_for_sync(trans, inode, parent, - sb, last_committed); - if (ret) - goto end_no_trans; + int ret; start_log_trans(trans, root); - - ret = btrfs_log_inode(trans, root, inode, inode_only); - BUG_ON(ret); - - /* - * for regular files, if its inode is already on disk, we don't - * have to worry about the parents at all. This is because - * we can use the last_unlink_trans field to record renames - * and other fun in this file. 
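An editorial aside, not part of the patch: the btrfs_log_inode() added above is a thin bracket around __btrfs_log_inode(), joining the log transaction before the real work and leaving it afterwards. The acquire/work/release shape, modelled standalone (every name below is a hypothetical stand-in):

#include <stdio.h>

static int log_writers;			/* models root->log_writers */

static void start_log_trans(void) { log_writers++; }
static void end_log_trans(void)   { log_writers--; }

static int do_log_inode(void)		/* models __btrfs_log_inode() */
{
	printf("logging with %d writer(s)\n", log_writers);
	return 0;
}

static int log_inode(void)		/* models the new btrfs_log_inode() */
{
	int ret;

	start_log_trans();
	ret = do_log_inode();
	end_log_trans();		/* always paired with the start */
	return ret;
}

int main(void) { return log_inode(); }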
- */ - if (S_ISREG(inode->i_mode) && - BTRFS_I(inode)->generation <= last_committed && - BTRFS_I(inode)->last_unlink_trans <= last_committed) - goto no_parent; - - inode_only = LOG_INODE_EXISTS; + sb = dentry->d_inode->i_sb; while (1) { - if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) - break; + ret = __btrfs_log_inode(trans, root, dentry->d_inode, + inode_only); + BUG_ON(ret); + inode_only = LOG_INODE_EXISTS; - inode = parent->d_inode; - if (BTRFS_I(inode)->generation > - root->fs_info->last_trans_committed) { - ret = btrfs_log_inode(trans, root, inode, inode_only); - BUG_ON(ret); - } - if (parent == sb->s_root) + dentry = dentry->d_parent; + if (!dentry || !dentry->d_inode || sb != dentry->d_inode->i_sb) break; - parent = parent->d_parent; + if (BTRFS_I(dentry->d_inode)->generation <= + root->fs_info->last_trans_committed) + break; } -no_parent: - ret = 0; - btrfs_end_log_trans(root); -end_no_trans: - return ret; + end_log_trans(root); + return 0; } /* @@ -2959,8 +2760,12 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct dentry *dentry) { - return btrfs_log_inode_parent(trans, root, dentry->d_inode, - dentry->d_parent, 0); + u64 gen; + gen = root->fs_info->last_trans_new_blockgroup; + if (gen > root->fs_info->last_trans_committed) + return 1; + else + return btrfs_log_dentry(trans, root, dentry); } /* @@ -3079,94 +2884,3 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) kfree(log_root_tree); return 0; } - -/* - * there are some corner cases where we want to force a full - * commit instead of allowing a directory to be logged. - * - * They revolve around files there were unlinked from the directory, and - * this function updates the parent directory so that a full commit is - * properly done if it is fsync'd later after the unlinks are done. - */ -void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, - struct inode *dir, struct inode *inode, - int for_rename) -{ - /* - * when we're logging a file, if it hasn't been renamed - * or unlinked, and its inode is fully committed on disk, - * we don't have to worry about walking up the directory chain - * to log its parents. - * - * So, we use the last_unlink_trans field to put this transid - * into the file. When the file is logged we check it and - * don't log the parents if the file is fully on disk. - */ - if (S_ISREG(inode->i_mode)) - BTRFS_I(inode)->last_unlink_trans = trans->transid; - - /* - * if this directory was already logged any new - * names for this file/dir will get recorded - */ - smp_mb(); - if (BTRFS_I(dir)->logged_trans == trans->transid) - return; - - /* - * if the inode we're about to unlink was logged, - * the log will be properly updated for any new names - */ - if (BTRFS_I(inode)->logged_trans == trans->transid) - return; - - /* - * when renaming files across directories, if the directory - * there we're unlinking from gets fsync'd later on, there's - * no way to find the destination directory later and fsync it - * properly. So, we have to be conservative and force commits - * so the new name gets discovered. - */ - if (for_rename) - goto record; - - /* we can safely do the unlink without any special recording */ - return; - -record: - BTRFS_I(dir)->last_unlink_trans = trans->transid; -} - -/* - * Call this after adding a new name for a file and it will properly - * update the log to reflect the new name. 
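An editorial aside, not part of the patch: btrfs_log_dentry() above logs the inode itself with LOG_INODE_ALL, then climbs d_parent, logging each ancestor with LOG_INODE_EXISTS, and stops at the first directory whose generation is at or below the last committed transaction, since that ancestor is already safe on disk. A simplified standalone model of the termination rule (hypothetical types and names):

#include <stdio.h>

struct dent {
	struct dent *parent;
	unsigned long generation;
};

static void log_dentry(struct dent *d, unsigned long last_committed)
{
	while (d) {
		printf("log inode with generation %lu\n", d->generation);
		d = d->parent;
		/* ancestors already committed need no log entry */
		if (!d || d->generation <= last_committed)
			break;
	}
}

int main(void)
{
	struct dent root = { NULL, 5 };
	struct dent dir  = { &root, 12 };
	struct dent file = { &dir, 14 };

	log_dentry(&file, 10);	/* logs file and dir, stops before root */
	return 0;
}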
- * - * It will return zero if all goes well, and it will return 1 if a - * full transaction commit is required. - */ -int btrfs_log_new_name(struct btrfs_trans_handle *trans, - struct inode *inode, struct inode *old_dir, - struct dentry *parent) -{ - struct btrfs_root * root = BTRFS_I(inode)->root; - - /* - * this will force the logging code to walk the dentry chain - * up for the file - */ - if (S_ISREG(inode->i_mode)) - BTRFS_I(inode)->last_unlink_trans = trans->transid; - - /* - * if this inode hasn't been logged and directory we're renaming it - * from hasn't been logged, we don't need to log it - */ - if (BTRFS_I(inode)->logged_trans <= - root->fs_info->last_trans_committed && - (!old_dir || BTRFS_I(old_dir)->logged_trans <= - root->fs_info->last_trans_committed)) - return 0; - - return btrfs_log_inode_parent(trans, root, inode, parent, 1); -} - diff --git a/trunk/fs/btrfs/tree-log.h b/trunk/fs/btrfs/tree-log.h index d09c7609e16b..b9409b32ed02 100644 --- a/trunk/fs/btrfs/tree-log.h +++ b/trunk/fs/btrfs/tree-log.h @@ -22,9 +22,14 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); +int btrfs_log_dentry(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct dentry *dentry); int btrfs_recover_log_trees(struct btrfs_root *tree_root); int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct dentry *dentry); +int btrfs_log_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + int inode_only); int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, @@ -33,16 +38,4 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, struct inode *inode, u64 dirid); -int btrfs_join_running_log_trans(struct btrfs_root *root); -int btrfs_end_log_trans(struct btrfs_root *root); -int btrfs_pin_log_trans(struct btrfs_root *root); -int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, - struct dentry *parent, int exists_only); -void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, - struct inode *dir, struct inode *inode, - int for_rename); -int btrfs_log_new_name(struct btrfs_trans_handle *trans, - struct inode *inode, struct inode *old_dir, - struct dentry *parent); #endif diff --git a/trunk/fs/ext4/balloc.c b/trunk/fs/ext4/balloc.c index 53c72ad85877..38f40d55899c 100644 --- a/trunk/fs/ext4/balloc.c +++ b/trunk/fs/ext4/balloc.c @@ -55,8 +55,7 @@ static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block, } static int ext4_group_used_meta_blocks(struct super_block *sb, - ext4_group_t block_group, - struct ext4_group_desc *gdp) + ext4_group_t block_group) { ext4_fsblk_t tmp; struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -64,6 +63,10 @@ static int ext4_group_used_meta_blocks(struct super_block *sb, int used_blocks = sbi->s_itb_per_group + 2; if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) { + struct ext4_group_desc *gdp; + struct buffer_head *bh; + + gdp = ext4_get_group_desc(sb, block_group, &bh); if (!ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp), block_group)) used_blocks--; @@ -174,7 +177,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, */ mark_bitmap_end(group_blocks, sb->s_blocksize * 8, bh->b_data); } - return free_blocks - 
ext4_group_used_meta_blocks(sb, block_group, gdp); + return free_blocks - ext4_group_used_meta_blocks(sb, block_group); } @@ -470,8 +473,9 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, block_group); - atomic_add(blocks_freed, - &sbi->s_flex_groups[flex_group].free_blocks); + spin_lock(sb_bgl_lock(sbi, flex_group)); + sbi->s_flex_groups[flex_group].free_blocks += blocks_freed; + spin_unlock(sb_bgl_lock(sbi, flex_group)); } /* * request to reload the buddy with the diff --git a/trunk/fs/ext4/dir.c b/trunk/fs/ext4/dir.c index b64789929a65..2df2e40b01af 100644 --- a/trunk/fs/ext4/dir.c +++ b/trunk/fs/ext4/dir.c @@ -67,8 +67,7 @@ int ext4_check_dir_entry(const char *function, struct inode *dir, unsigned int offset) { const char *error_msg = NULL; - const int rlen = ext4_rec_len_from_disk(de->rec_len, - dir->i_sb->s_blocksize); + const int rlen = ext4_rec_len_from_disk(de->rec_len); if (rlen < EXT4_DIR_REC_LEN(1)) error_msg = "rec_len is smaller than minimal"; @@ -179,11 +178,10 @@ static int ext4_readdir(struct file *filp, * least that it is non-zero. A * failure will be detected in the * dirent test below. */ - if (ext4_rec_len_from_disk(de->rec_len, - sb->s_blocksize) < EXT4_DIR_REC_LEN(1)) + if (ext4_rec_len_from_disk(de->rec_len) + < EXT4_DIR_REC_LEN(1)) break; - i += ext4_rec_len_from_disk(de->rec_len, - sb->s_blocksize); + i += ext4_rec_len_from_disk(de->rec_len); } offset = i; filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1)) @@ -205,8 +203,7 @@ static int ext4_readdir(struct file *filp, ret = stored; goto out; } - offset += ext4_rec_len_from_disk(de->rec_len, - sb->s_blocksize); + offset += ext4_rec_len_from_disk(de->rec_len); if (le32_to_cpu(de->inode)) { /* We might block in the next section * if the data destination is @@ -228,8 +225,7 @@ static int ext4_readdir(struct file *filp, goto revalidate; stored++; } - filp->f_pos += ext4_rec_len_from_disk(de->rec_len, - sb->s_blocksize); + filp->f_pos += ext4_rec_len_from_disk(de->rec_len); } offset = 0; brelse(bh); diff --git a/trunk/fs/ext4/ext4.h b/trunk/fs/ext4/ext4.h index d0f15ef56de1..990c94000924 100644 --- a/trunk/fs/ext4/ext4.h +++ b/trunk/fs/ext4/ext4.h @@ -32,6 +32,14 @@ */ #undef EXT4FS_DEBUG +/* + * Define EXT4_RESERVATION to reserve data blocks for expanding files + */ +#define EXT4_DEFAULT_RESERVE_BLOCKS 8 +/*max window size: 1024(direct blocks) + 3([t,d]indirect blocks) */ +#define EXT4_MAX_RESERVE_BLOCKS 1027 +#define EXT4_RESERVE_WINDOW_NOT_ALLOCATED 0 + /* * Debug code */ @@ -46,6 +54,8 @@ #define ext4_debug(f, a...) do {} while (0) #endif +#define EXT4_MULTIBLOCK_ALLOCATOR 1 + /* prefer goal again. length */ #define EXT4_MB_HINT_MERGE 1 /* blocks already reserved */ @@ -170,9 +180,8 @@ struct ext4_group_desc */ struct flex_groups { - atomic_t free_inodes; - atomic_t free_blocks; - atomic_t used_dirs; + __u32 free_inodes; + __u32 free_blocks; }; #define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */ @@ -240,30 +249,6 @@ struct flex_groups { #define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */ #define EXT4_FL_USER_MODIFIABLE 0x000B80FF /* User modifiable flags */ -/* Flags that should be inherited by new inodes from their parent. 
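An editorial aside, not part of the patch: the dir.c hunks above switch back to the one-argument ext4_rec_len_from_disk()/ext4_rec_len_to_disk() pair (restored in full in the ext4.h hunk below), which squeezes a 65536-byte record into a 16-bit field by reserving the on-disk value EXT4_MAX_REC_LEN. A standalone round trip of that convention; this userspace model uses plain integers and ignores __le16 byte order:

#include <assert.h>

#define MAX_REC_LEN ((1 << 16) - 1)	/* stands in for EXT4_MAX_REC_LEN */

static unsigned rec_len_from_disk(unsigned short dlen)
{
	if (dlen == MAX_REC_LEN || dlen == 0)
		return 1 << 16;		/* a record covering a 64KB block */
	return dlen;
}

static unsigned short rec_len_to_disk(unsigned len)
{
	assert(len <= (1 << 16));	/* the kernel version BUG()s past this */
	if (len == (1 << 16))
		return MAX_REC_LEN;
	return (unsigned short)len;
}

int main(void)
{
	assert(rec_len_from_disk(rec_len_to_disk(1 << 16)) == (1 << 16));
	assert(rec_len_from_disk(rec_len_to_disk(4096)) == 4096);
	return 0;
}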
*/ -#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ - EXT4_SYNC_FL | EXT4_IMMUTABLE_FL | EXT4_APPEND_FL |\ - EXT4_NODUMP_FL | EXT4_NOATIME_FL |\ - EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\ - EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL) - -/* Flags that are appropriate for regular files (all but dir-specific ones). */ -#define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL)) - -/* Flags that are appropriate for non-directories/regular files. */ -#define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL) - -/* Mask out flags that are inappropriate for the given type of inode. */ -static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags) -{ - if (S_ISDIR(mode)) - return flags; - else if (S_ISREG(mode)) - return flags & EXT4_REG_FLMASK; - else - return flags & EXT4_OTHER_FLMASK; -} - /* * Inode dynamic state flags */ @@ -271,7 +256,6 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags) #define EXT4_STATE_NEW 0x00000002 /* inode is newly created */ #define EXT4_STATE_XATTR 0x00000004 /* has in-inode xattrs */ #define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */ -#define EXT4_STATE_DA_ALLOC_CLOSE 0x00000010 /* Alloc DA blks on close */ /* Used to pass group descriptor data when online resize is done */ struct ext4_new_group_input { @@ -319,9 +303,7 @@ struct ext4_new_group_data { #define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long) #define EXT4_IOC_GROUP_ADD _IOW('f', 8, struct ext4_new_group_input) #define EXT4_IOC_MIGRATE _IO('f', 9) - /* note ioctl 10 reserved for an early version of the FIEMAP ioctl */ /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */ -#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12) /* * ioctl commands in 32 bit emulation @@ -549,7 +531,7 @@ do { \ #define EXT4_MOUNT_NO_UID32 0x02000 /* Disable 32-bit UIDs */ #define EXT4_MOUNT_XATTR_USER 0x04000 /* Extended user attributes */ #define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ -#define EXT4_MOUNT_NO_AUTO_DA_ALLOC 0x10000 /* No auto delalloc mapping */ +#define EXT4_MOUNT_RESERVATION 0x10000 /* Preallocation */ #define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */ #define EXT4_MOUNT_NOBH 0x40000 /* No bufferheads */ #define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ @@ -684,8 +666,7 @@ struct ext4_super_block { __u8 s_log_groups_per_flex; /* FLEX_BG group size */ __u8 s_reserved_char_pad2; __le16 s_reserved_pad; - __le64 s_kbytes_written; /* nr of lifetime kilobytes written */ - __u32 s_reserved[160]; /* Padding to the end of the block */ + __u32 s_reserved[162]; /* Padding to the end of the block */ }; #ifdef __KERNEL__ @@ -832,12 +813,6 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) #define EXT4_DEF_MIN_BATCH_TIME 0 #define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */ -/* - * Minimum number of groups in a flexgroup before we separate out - * directories into the first block group of a flexgroup - */ -#define EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME 4 - /* * Structure of a directory entry */ @@ -890,6 +865,24 @@ struct ext4_dir_entry_2 { ~EXT4_DIR_ROUND) #define EXT4_MAX_REC_LEN ((1<<16)-1) +static inline unsigned ext4_rec_len_from_disk(__le16 dlen) +{ + unsigned len = le16_to_cpu(dlen); + + if (len == EXT4_MAX_REC_LEN || len == 0) + return 1 << 16; + return len; +} + +static inline __le16 ext4_rec_len_to_disk(unsigned len) +{ + if (len == (1 << 16)) + return cpu_to_le16(EXT4_MAX_REC_LEN); + else if (len > (1 << 16)) + BUG(); + return cpu_to_le16(len); +} + /* * Hash Tree Directory indexing * 
(c) Daniel Phillips, 2001 @@ -977,6 +970,22 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, extern struct proc_dir_entry *ext4_proc_root; +#ifdef CONFIG_PROC_FS +extern const struct file_operations ext4_ui_proc_fops; + +#define EXT4_PROC_HANDLER(name, var) \ +do { \ + proc = proc_create_data(name, mode, sbi->s_proc, \ + &ext4_ui_proc_fops, &sbi->s_##var); \ + if (proc == NULL) { \ + printk(KERN_ERR "EXT4-fs: can't create %s\n", name); \ + goto err_out; \ + } \ +} while (0) +#else +#define EXT4_PROC_HANDLER(name, var) +#endif + /* * Function prototypes */ @@ -1083,7 +1092,6 @@ extern int ext4_can_truncate(struct inode *inode); extern void ext4_truncate(struct inode *); extern void ext4_set_inode_flags(struct inode *); extern void ext4_get_inode_flags(struct ext4_inode_info *); -extern int ext4_alloc_da_blocks(struct inode *inode); extern void ext4_set_aops(struct inode *inode); extern int ext4_writepage_trans_blocks(struct inode *); extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks); @@ -1099,10 +1107,7 @@ extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); /* migrate.c */ extern int ext4_ext_migrate(struct inode *); - /* namei.c */ -extern unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize); -extern __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize); extern int ext4_orphan_add(handle_t *, struct inode *); extern int ext4_orphan_del(handle_t *, struct inode *); extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, diff --git a/trunk/fs/ext4/ext4_extents.h b/trunk/fs/ext4/ext4_extents.h index f0c3ec85bd48..18cb67b2cbbc 100644 --- a/trunk/fs/ext4/ext4_extents.h +++ b/trunk/fs/ext4/ext4_extents.h @@ -241,6 +241,5 @@ extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *, extern int ext4_ext_search_right(struct inode *, struct ext4_ext_path *, ext4_lblk_t *, ext4_fsblk_t *); extern void ext4_ext_drop_refs(struct ext4_ext_path *); -extern int ext4_ext_check_inode(struct inode *inode); #endif /* _EXT4_EXTENTS */ diff --git a/trunk/fs/ext4/ext4_i.h b/trunk/fs/ext4/ext4_i.h index 4ce2187123aa..e69acc16f5c4 100644 --- a/trunk/fs/ext4/ext4_i.h +++ b/trunk/fs/ext4/ext4_i.h @@ -33,6 +33,9 @@ typedef __u32 ext4_lblk_t; /* data type for block group number */ typedef unsigned int ext4_group_t; +#define rsv_start rsv_window._rsv_start +#define rsv_end rsv_window._rsv_end + /* * storage for cached extent */ @@ -122,9 +125,6 @@ struct ext4_inode_info { struct list_head i_prealloc_list; spinlock_t i_prealloc_lock; - /* ialloc */ - ext4_group_t i_last_alloc_group; - /* allocation reservation info for delalloc */ unsigned int i_reserved_data_blocks; unsigned int i_reserved_meta_blocks; diff --git a/trunk/fs/ext4/ext4_sb.h b/trunk/fs/ext4/ext4_sb.h index 57b71fefbccf..039b6ea1a042 100644 --- a/trunk/fs/ext4/ext4_sb.h +++ b/trunk/fs/ext4/ext4_sb.h @@ -62,10 +62,12 @@ struct ext4_sb_info { struct percpu_counter s_freeinodes_counter; struct percpu_counter s_dirs_counter; struct percpu_counter s_dirtyblocks_counter; - struct blockgroup_lock *s_blockgroup_lock; + struct blockgroup_lock s_blockgroup_lock; struct proc_dir_entry *s_proc; - struct kobject s_kobj; - struct completion s_kobj_unregister; + + /* root of the per fs reservation window tree */ + spinlock_t s_rsv_window_lock; + struct rb_root s_rsv_window_root; /* Journaling */ struct inode *s_journal_inode; @@ -144,10 +146,6 @@ struct ext4_sb_info { /* locality groups */ struct ext4_locality_group 
*s_locality_groups; - /* for write statistics */ - unsigned long s_sectors_written_start; - u64 s_kbytes_written; - unsigned int s_log_groups_per_flex; struct flex_groups *s_flex_groups; }; @@ -155,7 +153,7 @@ struct ext4_sb_info { static inline spinlock_t * sb_bgl_lock(struct ext4_sb_info *sbi, unsigned int block_group) { - return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group); + return bgl_lock_ptr(&sbi->s_blockgroup_lock, block_group); } #endif /* _EXT4_SB */ diff --git a/trunk/fs/ext4/extents.c b/trunk/fs/ext4/extents.c index ac77d8b8251d..e0aa4fe4f596 100644 --- a/trunk/fs/ext4/extents.c +++ b/trunk/fs/ext4/extents.c @@ -152,8 +152,6 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, ext4_fsblk_t bg_start; ext4_fsblk_t last_block; ext4_grpblk_t colour; - ext4_group_t block_group; - int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); int depth; if (path) { @@ -172,31 +170,10 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, } /* OK. use inode's group */ - block_group = ei->i_block_group; - if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { - /* - * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME - * block groups per flexgroup, reserve the first block - * group for directories and special files. Regular - * files will start at the second block group. This - * tends to speed up directory access and improves - * fsck times. - */ - block_group &= ~(flex_size-1); - if (S_ISREG(inode->i_mode)) - block_group++; - } - bg_start = (block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) + + bg_start = (ei->i_block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) + le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block); last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; - /* - * If we are doing delayed allocation, we don't need take - * colour into account. 
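An editorial aside, not part of the patch: with the delayed-allocation shortcut deleted above, ext4_ext_find_goal() always starts from the inode's own block group and adds a per-process "colour" so concurrent writers fan out inside the group: colour = (pid % 16) * (blocks_per_group / 16). The arithmetic, modelled standalone with hypothetical names:

#include <stdio.h>

static unsigned long long find_goal(unsigned long long bg_start,
				    unsigned blocks_per_group, int pid)
{
	unsigned colour = (pid % 16) * (blocks_per_group / 16);

	return bg_start + colour;	/* begin the search here, not at 0 */
}

int main(void)
{
	/* neighbouring pids in a 32768-block group land 2048 blocks apart */
	printf("%llu\n", find_goal(1024, 32768, 100));	/* 1024 + 4 * 2048 */
	printf("%llu\n", find_goal(1024, 32768, 101));	/* 1024 + 5 * 2048 */
	return 0;
}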
- */ - if (test_opt(inode->i_sb, DELALLOC)) - return bg_start; - if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) colour = (current->pid % 16) * (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); @@ -324,64 +301,7 @@ ext4_ext_max_entries(struct inode *inode, int depth) return max; } -static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) -{ - ext4_fsblk_t block = ext_pblock(ext); - int len = ext4_ext_get_actual_len(ext); - struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; - if (unlikely(block < le32_to_cpu(es->s_first_data_block) || - ((block + len) > ext4_blocks_count(es)))) - return 0; - else - return 1; -} - -static int ext4_valid_extent_idx(struct inode *inode, - struct ext4_extent_idx *ext_idx) -{ - ext4_fsblk_t block = idx_pblock(ext_idx); - struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; - if (unlikely(block < le32_to_cpu(es->s_first_data_block) || - (block > ext4_blocks_count(es)))) - return 0; - else - return 1; -} - -static int ext4_valid_extent_entries(struct inode *inode, - struct ext4_extent_header *eh, - int depth) -{ - struct ext4_extent *ext; - struct ext4_extent_idx *ext_idx; - unsigned short entries; - if (eh->eh_entries == 0) - return 1; - - entries = le16_to_cpu(eh->eh_entries); - - if (depth == 0) { - /* leaf entries */ - ext = EXT_FIRST_EXTENT(eh); - while (entries) { - if (!ext4_valid_extent(inode, ext)) - return 0; - ext++; - entries--; - } - } else { - ext_idx = EXT_FIRST_INDEX(eh); - while (entries) { - if (!ext4_valid_extent_idx(inode, ext_idx)) - return 0; - ext_idx++; - entries--; - } - } - return 1; -} - -static int __ext4_ext_check(const char *function, struct inode *inode, +static int __ext4_ext_check_header(const char *function, struct inode *inode, struct ext4_extent_header *eh, int depth) { @@ -409,15 +329,11 @@ static int __ext4_ext_check(const char *function, struct inode *inode, error_msg = "invalid eh_entries"; goto corrupted; } - if (!ext4_valid_extent_entries(inode, eh, depth)) { - error_msg = "invalid extent entries"; - goto corrupted; - } return 0; corrupted: ext4_error(inode->i_sb, function, - "bad header/extent in inode #%lu: %s - magic %x, " + "bad header in inode #%lu: %s - magic %x, " "entries %u, max %u(%u), depth %u(%u)", inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic), le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max), @@ -426,13 +342,8 @@ static int __ext4_ext_check(const char *function, struct inode *inode, return -EIO; } -#define ext4_ext_check(inode, eh, depth) \ - __ext4_ext_check(__func__, inode, eh, depth) - -int ext4_ext_check_inode(struct inode *inode) -{ - return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode)); -} +#define ext4_ext_check_header(inode, eh, depth) \ + __ext4_ext_check_header(__func__, inode, eh, depth) #ifdef EXT_DEBUG static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) @@ -636,6 +547,9 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, eh = ext_inode_hdr(inode); depth = ext_depth(inode); + if (ext4_ext_check_header(inode, eh, depth)) + return ERR_PTR(-EIO); + /* account possible depth increase */ if (!path) { @@ -651,8 +565,6 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, i = depth; /* walk through the tree */ while (i) { - int need_to_validate = 0; - ext_debug("depth %d: num %d, max %d\n", ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); @@ -661,17 +573,10 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, path[ppos].p_depth = i; path[ppos].p_ext = NULL; - bh = 
sb_getblk(inode->i_sb, path[ppos].p_block); - if (unlikely(!bh)) + bh = sb_bread(inode->i_sb, path[ppos].p_block); + if (!bh) goto err; - if (!bh_uptodate_or_lock(bh)) { - if (bh_submit_read(bh) < 0) { - put_bh(bh); - goto err; - } - /* validate the extent entries */ - need_to_validate = 1; - } + eh = ext_block_hdr(bh); ppos++; BUG_ON(ppos > depth); @@ -679,7 +584,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, path[ppos].p_hdr = eh; i--; - if (need_to_validate && ext4_ext_check(inode, eh, i)) + if (ext4_ext_check_header(inode, eh, i)) goto err; } @@ -1276,7 +1181,7 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path, return -EIO; eh = ext_block_hdr(bh); /* subtract from p_depth to get proper eh_depth */ - if (ext4_ext_check(inode, eh, path->p_depth - depth)) { + if (ext4_ext_check_header(inode, eh, path->p_depth - depth)) { put_bh(bh); return -EIO; } @@ -1289,7 +1194,7 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path, if (bh == NULL) return -EIO; eh = ext_block_hdr(bh); - if (ext4_ext_check(inode, eh, path->p_depth - depth)) { + if (ext4_ext_check_header(inode, eh, path->p_depth - depth)) { put_bh(bh); return -EIO; } @@ -2232,7 +2137,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) return -ENOMEM; } path[0].p_hdr = ext_inode_hdr(inode); - if (ext4_ext_check(inode, path[0].p_hdr, depth)) { + if (ext4_ext_check_header(inode, path[0].p_hdr, depth)) { err = -EIO; goto out; } @@ -2286,7 +2191,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) err = -EIO; break; } - if (ext4_ext_check(inode, ext_block_hdr(bh), + if (ext4_ext_check_header(inode, ext_block_hdr(bh), depth - i - 1)) { err = -EIO; break; diff --git a/trunk/fs/ext4/file.c b/trunk/fs/ext4/file.c index 588af8c77246..f731cb545a03 100644 --- a/trunk/fs/ext4/file.c +++ b/trunk/fs/ext4/file.c @@ -33,14 +33,9 @@ */ static int ext4_release_file(struct inode *inode, struct file *filp) { - if (EXT4_I(inode)->i_state & EXT4_STATE_DA_ALLOC_CLOSE) { - ext4_alloc_da_blocks(inode); - EXT4_I(inode)->i_state &= ~EXT4_STATE_DA_ALLOC_CLOSE; - } /* if we are the last writer on the inode, drop the block reservation */ if ((filp->f_mode & FMODE_WRITE) && - (atomic_read(&inode->i_writecount) == 1) && - !EXT4_I(inode)->i_reserved_data_blocks) + (atomic_read(&inode->i_writecount) == 1)) { down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); diff --git a/trunk/fs/ext4/ialloc.c b/trunk/fs/ext4/ialloc.c index 47b84e8df568..fb51b40e3e8f 100644 --- a/trunk/fs/ext4/ialloc.c +++ b/trunk/fs/ext4/ialloc.c @@ -189,6 +189,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) struct ext4_super_block *es; struct ext4_sb_info *sbi; int fatal = 0, err, count, cleared; + ext4_group_t flex_group; if (atomic_read(&inode->i_count) > 1) { printk(KERN_ERR "ext4_free_inode: inode has count=%d\n", @@ -267,13 +268,6 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) if (is_directory) { count = ext4_used_dirs_count(sb, gdp) - 1; ext4_used_dirs_set(sb, gdp, count); - if (sbi->s_log_groups_per_flex) { - ext4_group_t f; - - f = ext4_flex_group(sbi, block_group); - atomic_dec(&sbi->s_flex_groups[f].free_inodes); - } - } gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); @@ -283,10 +277,10 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) percpu_counter_dec(&sbi->s_dirs_counter); if (sbi->s_log_groups_per_flex) { - ext4_group_t f; - - f = ext4_flex_group(sbi, block_group); - 
atomic_inc(&sbi->s_flex_groups[f].free_inodes); + flex_group = ext4_flex_group(sbi, block_group); + spin_lock(sb_bgl_lock(sbi, flex_group)); + sbi->s_flex_groups[flex_group].free_inodes++; + spin_unlock(sb_bgl_lock(sbi, flex_group)); } } BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata"); @@ -366,9 +360,9 @@ static int find_group_flex(struct super_block *sb, struct inode *parent, sbi->s_log_groups_per_flex; find_close_to_parent: - flexbg_free_blocks = atomic_read(&flex_group[best_flex].free_blocks); + flexbg_free_blocks = flex_group[best_flex].free_blocks; flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex; - if (atomic_read(&flex_group[best_flex].free_inodes) && + if (flex_group[best_flex].free_inodes && flex_freeb_ratio > free_block_ratio) goto found_flexbg; @@ -381,24 +375,24 @@ static int find_group_flex(struct super_block *sb, struct inode *parent, if (i == parent_fbg_group || i == parent_fbg_group - 1) continue; - flexbg_free_blocks = atomic_read(&flex_group[i].free_blocks); + flexbg_free_blocks = flex_group[i].free_blocks; flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex; if (flex_freeb_ratio > free_block_ratio && - (atomic_read(&flex_group[i].free_inodes))) { + flex_group[i].free_inodes) { best_flex = i; goto found_flexbg; } - if ((atomic_read(&flex_group[best_flex].free_inodes) == 0) || - ((atomic_read(&flex_group[i].free_blocks) > - atomic_read(&flex_group[best_flex].free_blocks)) && - atomic_read(&flex_group[i].free_inodes))) + if (flex_group[best_flex].free_inodes == 0 || + (flex_group[i].free_blocks > + flex_group[best_flex].free_blocks && + flex_group[i].free_inodes)) best_flex = i; } - if (!atomic_read(&flex_group[best_flex].free_inodes) || - !atomic_read(&flex_group[best_flex].free_blocks)) + if (!flex_group[best_flex].free_inodes || + !flex_group[best_flex].free_blocks) return -1; found_flexbg: @@ -416,42 +410,6 @@ static int find_group_flex(struct super_block *sb, struct inode *parent, return 0; } -struct orlov_stats { - __u32 free_inodes; - __u32 free_blocks; - __u32 used_dirs; -}; - -/* - * Helper function for Orlov's allocator; returns critical information - * for a particular block group or flex_bg. If flex_size is 1, then g - * is a block group number; otherwise it is flex_bg number. - */ -void get_orlov_stats(struct super_block *sb, ext4_group_t g, - int flex_size, struct orlov_stats *stats) -{ - struct ext4_group_desc *desc; - struct flex_groups *flex_group = EXT4_SB(sb)->s_flex_groups; - - if (flex_size > 1) { - stats->free_inodes = atomic_read(&flex_group[g].free_inodes); - stats->free_blocks = atomic_read(&flex_group[g].free_blocks); - stats->used_dirs = atomic_read(&flex_group[g].used_dirs); - return; - } - - desc = ext4_get_group_desc(sb, g, NULL); - if (desc) { - stats->free_inodes = ext4_free_inodes_count(sb, desc); - stats->free_blocks = ext4_free_blks_count(sb, desc); - stats->used_dirs = ext4_used_dirs_count(sb, desc); - } else { - stats->free_inodes = 0; - stats->free_blocks = 0; - stats->used_dirs = 0; - } -} - /* * Orlov's allocator for directories. * @@ -467,34 +425,35 @@ void get_orlov_stats(struct super_block *sb, ext4_group_t g, * it has too many directories already (max_dirs) or * it has too few free inodes left (min_inodes) or * it has too few free blocks left (min_blocks) or + * it is already running up too large a debt (max_debt). * Parent's group is preferred, if it doesn't satisfy these * conditions we search cyclically through the rest.
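An editorial aside, not part of the patch: find_group_flex(), as restored above (reading the plain __u32 counters directly rather than through atomic_read()), accepts a flex group only if it still has free inodes and its free-block percentage clears the free_block_ratio threshold. A standalone model of that acceptance test, all names hypothetical:

#include <stdio.h>

static int flex_group_ok(unsigned free_blocks, unsigned free_inodes,
			 unsigned blocks_per_flex, unsigned free_block_ratio)
{
	unsigned flex_freeb_ratio = free_blocks * 100 / blocks_per_flex;

	return free_inodes && flex_freeb_ratio > free_block_ratio;
}

int main(void)
{
	/* 12000 of 524288 blocks free is about 2%: rejected at 10% */
	printf("%d\n", flex_group_ok(12000, 50, 524288, 10));	/* 0 */
	printf("%d\n", flex_group_ok(90000, 50, 524288, 10));	/* 1 */
	return 0;
}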
If none * of the groups look good we just look for a group with more * free inodes than average (starting at parent's group). + * + * Debt is incremented each time we allocate a directory and decremented + * when we allocate an inode, within 0--255. */ +#define INODE_COST 64 +#define BLOCK_COST 256 + static int find_group_orlov(struct super_block *sb, struct inode *parent, - ext4_group_t *group, int mode) + ext4_group_t *group) { ext4_group_t parent_group = EXT4_I(parent)->i_block_group; struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; ext4_group_t ngroups = sbi->s_groups_count; int inodes_per_group = EXT4_INODES_PER_GROUP(sb); unsigned int freei, avefreei; ext4_fsblk_t freeb, avefreeb; + ext4_fsblk_t blocks_per_dir; unsigned int ndirs; - int max_dirs, min_inodes; + int max_debt, max_dirs, min_inodes; ext4_grpblk_t min_blocks; - ext4_group_t i, grp, g; + ext4_group_t i; struct ext4_group_desc *desc; - struct orlov_stats stats; - int flex_size = ext4_flex_bg_size(sbi); - - if (flex_size > 1) { - ngroups = (ngroups + flex_size - 1) >> - sbi->s_log_groups_per_flex; - parent_group >>= sbi->s_log_groups_per_flex; - } freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter); avefreei = freei / ngroups; @@ -503,97 +462,71 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, do_div(avefreeb, ngroups); ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); - if (S_ISDIR(mode) && - ((parent == sb->s_root->d_inode) || - (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL))) { + if ((parent == sb->s_root->d_inode) || + (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL)) { int best_ndir = inodes_per_group; + ext4_group_t grp; int ret = -1; get_random_bytes(&grp, sizeof(grp)); parent_group = (unsigned)grp % ngroups; for (i = 0; i < ngroups; i++) { - g = (parent_group + i) % ngroups; - get_orlov_stats(sb, g, flex_size, &stats); - if (!stats.free_inodes) + grp = (parent_group + i) % ngroups; + desc = ext4_get_group_desc(sb, grp, NULL); + if (!desc || !ext4_free_inodes_count(sb, desc)) continue; - if (stats.used_dirs >= best_ndir) + if (ext4_used_dirs_count(sb, desc) >= best_ndir) continue; - if (stats.free_inodes < avefreei) + if (ext4_free_inodes_count(sb, desc) < avefreei) continue; - if (stats.free_blocks < avefreeb) + if (ext4_free_blks_count(sb, desc) < avefreeb) continue; - grp = g; - ret = 0; - best_ndir = stats.used_dirs; - } - if (ret) - goto fallback; - found_flex_bg: - if (flex_size == 1) { *group = grp; - return 0; - } - - /* - * We pack inodes at the beginning of the flexgroup's - * inode tables. Block allocation decisions will do - * something similar, although regular files will - * start at 2nd block group of the flexgroup. See - * ext4_ext_find_goal() and ext4_find_near(). 
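An editorial aside, not part of the patch: the debt limit restored above starts from the blocks in a group, divides by the larger of the measured blocks-per-directory and BLOCK_COST, caps the result so max_debt * INODE_COST cannot exceed the inodes in a group, and finally clamps it to the range 1..255. The same computation as a standalone function:

#include <stdio.h>

#define INODE_COST 64
#define BLOCK_COST 256

static int compute_max_debt(int blocks_per_group, int inodes_per_group,
			    long long blocks_per_dir)
{
	int max_debt = blocks_per_group;

	max_debt /= blocks_per_dir > BLOCK_COST ? blocks_per_dir : BLOCK_COST;
	if (max_debt * INODE_COST > inodes_per_group)
		max_debt = inodes_per_group / INODE_COST;
	if (max_debt > 255)
		max_debt = 255;
	if (max_debt == 0)
		max_debt = 1;
	return max_debt;
}

int main(void)
{
	/* a common 4KB-block layout: 32768 blocks, 8192 inodes per group */
	printf("max_debt = %d\n", compute_max_debt(32768, 8192, 100));	/* 128 */
	return 0;
}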
- */ - grp *= flex_size; - for (i = 0; i < flex_size; i++) { - if (grp+i >= sbi->s_groups_count) - break; - desc = ext4_get_group_desc(sb, grp+i, NULL); - if (desc && ext4_free_inodes_count(sb, desc)) { - *group = grp+i; - return 0; - } + ret = 0; + best_ndir = ext4_used_dirs_count(sb, desc); } + if (ret == 0) + return ret; goto fallback; } - max_dirs = ndirs / ngroups + inodes_per_group / 16; - min_inodes = avefreei - inodes_per_group*flex_size / 4; - if (min_inodes < 1) - min_inodes = 1; - min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb)*flex_size / 4; + blocks_per_dir = ext4_blocks_count(es) - freeb; + do_div(blocks_per_dir, ndirs); - /* - * Start looking in the flex group where we last allocated an - * inode for this parent directory - */ - if (EXT4_I(parent)->i_last_alloc_group != ~0) { - parent_group = EXT4_I(parent)->i_last_alloc_group; - if (flex_size > 1) - parent_group >>= sbi->s_log_groups_per_flex; - } + max_dirs = ndirs / ngroups + inodes_per_group / 16; + min_inodes = avefreei - inodes_per_group / 4; + min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb) / 4; + + max_debt = EXT4_BLOCKS_PER_GROUP(sb); + max_debt /= max_t(int, blocks_per_dir, BLOCK_COST); + if (max_debt * INODE_COST > inodes_per_group) + max_debt = inodes_per_group / INODE_COST; + if (max_debt > 255) + max_debt = 255; + if (max_debt == 0) + max_debt = 1; for (i = 0; i < ngroups; i++) { - grp = (parent_group + i) % ngroups; - get_orlov_stats(sb, grp, flex_size, &stats); - if (stats.used_dirs >= max_dirs) + *group = (parent_group + i) % ngroups; + desc = ext4_get_group_desc(sb, *group, NULL); + if (!desc || !ext4_free_inodes_count(sb, desc)) + continue; + if (ext4_used_dirs_count(sb, desc) >= max_dirs) continue; - if (stats.free_inodes < min_inodes) + if (ext4_free_inodes_count(sb, desc) < min_inodes) continue; - if (stats.free_blocks < min_blocks) + if (ext4_free_blks_count(sb, desc) < min_blocks) continue; - goto found_flex_bg; + return 0; } fallback: - ngroups = sbi->s_groups_count; - avefreei = freei / ngroups; - parent_group = EXT4_I(parent)->i_block_group; for (i = 0; i < ngroups; i++) { - grp = (parent_group + i) % ngroups; - desc = ext4_get_group_desc(sb, grp, NULL); + *group = (parent_group + i) % ngroups; + desc = ext4_get_group_desc(sb, *group, NULL); if (desc && ext4_free_inodes_count(sb, desc) && - ext4_free_inodes_count(sb, desc) >= avefreei) { - *group = grp; + ext4_free_inodes_count(sb, desc) >= avefreei) return 0; - } } if (avefreei) { @@ -609,51 +542,12 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, } static int find_group_other(struct super_block *sb, struct inode *parent, - ext4_group_t *group, int mode) + ext4_group_t *group) { ext4_group_t parent_group = EXT4_I(parent)->i_block_group; ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; struct ext4_group_desc *desc; - ext4_group_t i, last; - int flex_size = ext4_flex_bg_size(EXT4_SB(sb)); - - /* - * Try to place the inode is the same flex group as its - * parent. If we can't find space, use the Orlov algorithm to - * find another flex group, and store that information in the - * parent directory's inode information so that use that flex - * group for future allocations. 
- */ - if (flex_size > 1) { - int retry = 0; - - try_again: - parent_group &= ~(flex_size-1); - last = parent_group + flex_size; - if (last > ngroups) - last = ngroups; - for (i = parent_group; i < last; i++) { - desc = ext4_get_group_desc(sb, i, NULL); - if (desc && ext4_free_inodes_count(sb, desc)) { - *group = i; - return 0; - } - } - if (!retry && EXT4_I(parent)->i_last_alloc_group != ~0) { - retry = 1; - parent_group = EXT4_I(parent)->i_last_alloc_group; - goto try_again; - } - /* - * If this didn't work, use the Orlov search algorithm - * to find a new flex group; we pass in the mode to - * avoid the topdir algorithms. - */ - *group = parent_group + flex_size; - if (*group > ngroups) - *group = 0; - return find_group_orlov(sb, parent, group, mode); - } + ext4_group_t i; /* * Try to place the inode in its parent directory @@ -771,11 +665,6 @@ static int ext4_claim_inode(struct super_block *sb, if (S_ISDIR(mode)) { count = ext4_used_dirs_count(sb, gdp) + 1; ext4_used_dirs_set(sb, gdp, count); - if (sbi->s_log_groups_per_flex) { - ext4_group_t f = ext4_flex_group(sbi, group); - - atomic_inc(&sbi->s_flex_groups[f].free_inodes); - } } gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); err_ret: @@ -827,10 +716,10 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) sbi = EXT4_SB(sb); es = sbi->s_es; - if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) { + if (sbi->s_log_groups_per_flex) { ret2 = find_group_flex(sb, dir, &group); if (ret2 == -1) { - ret2 = find_group_other(sb, dir, &group, mode); + ret2 = find_group_other(sb, dir, &group); if (ret2 == 0 && once) once = 0; printk(KERN_NOTICE "ext4: find_group_flex " @@ -844,12 +733,11 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) if (test_opt(sb, OLDALLOC)) ret2 = find_group_dir(sb, dir, &group); else - ret2 = find_group_orlov(sb, dir, &group, mode); + ret2 = find_group_orlov(sb, dir, &group); } else - ret2 = find_group_other(sb, dir, &group, mode); + ret2 = find_group_other(sb, dir, &group); got_group: - EXT4_I(dir)->i_last_alloc_group = group; err = -ENOSPC; if (ret2 == -1) goto out; @@ -970,7 +858,9 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) if (sbi->s_log_groups_per_flex) { flex_group = ext4_flex_group(sbi, group); - atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes); + spin_lock(sb_bgl_lock(sbi, flex_group)); + sbi->s_flex_groups[flex_group].free_inodes--; + spin_unlock(sb_bgl_lock(sbi, flex_group)); } inode->i_uid = current_fsuid(); @@ -995,16 +885,19 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) ei->i_disksize = 0; /* - * Don't inherit extent flag from directory, amongst others. We set - * extent flag on newly created directory and file only if -o extent - * mount option is specified + * Don't inherit extent flag from directory. 
We set extent flag on + * newly created directory and file only if -o extent mount option is + * specified */ - ei->i_flags = - ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED); + ei->i_flags = EXT4_I(dir)->i_flags & ~(EXT4_INDEX_FL|EXT4_EXTENTS_FL); + if (S_ISLNK(mode)) + ei->i_flags &= ~(EXT4_IMMUTABLE_FL|EXT4_APPEND_FL); + /* dirsync only applies to directories */ + if (!S_ISDIR(mode)) + ei->i_flags &= ~EXT4_DIRSYNC_FL; ei->i_file_acl = 0; ei->i_dtime = 0; ei->i_block_group = group; - ei->i_last_alloc_group = ~0; ext4_set_inode_flags(inode); if (IS_DIRSYNC(inode)) diff --git a/trunk/fs/ext4/inode.c b/trunk/fs/ext4/inode.c index a2e7952bc5f9..dd82ff390067 100644 --- a/trunk/fs/ext4/inode.c +++ b/trunk/fs/ext4/inode.c @@ -371,34 +371,6 @@ static int ext4_block_to_path(struct inode *inode, return n; } -static int __ext4_check_blockref(const char *function, struct inode *inode, - unsigned int *p, unsigned int max) { - - unsigned int maxblocks = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es); - unsigned int *bref = p; - while (bref < p+max) { - if (unlikely(*bref >= maxblocks)) { - ext4_error(inode->i_sb, function, - "block reference %u >= max (%u) " - "in inode #%lu, offset=%d", - *bref, maxblocks, - inode->i_ino, (int)(bref-p)); - return -EIO; - } - bref++; - } - return 0; -} - - -#define ext4_check_indirect_blockref(inode, bh) \ - __ext4_check_blockref(__func__, inode, (__le32 *)(bh)->b_data, \ - EXT4_ADDR_PER_BLOCK((inode)->i_sb)) - -#define ext4_check_inode_blockref(inode) \ - __ext4_check_blockref(__func__, inode, EXT4_I(inode)->i_data, \ - EXT4_NDIR_BLOCKS) - /** * ext4_get_branch - read the chain of indirect blocks leading to data * @inode: inode in question @@ -443,22 +415,9 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth, if (!p->key) goto no_block; while (--depth) { - bh = sb_getblk(sb, le32_to_cpu(p->key)); - if (unlikely(!bh)) + bh = sb_bread(sb, le32_to_cpu(p->key)); + if (!bh) goto failure; - - if (!bh_uptodate_or_lock(bh)) { - if (bh_submit_read(bh) < 0) { - put_bh(bh); - goto failure; - } - /* validate block references */ - if (ext4_check_indirect_blockref(inode, bh)) { - put_bh(bh); - goto failure; - } - } - add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets); /* Reader: end */ if (!p->key) @@ -500,8 +459,6 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) ext4_fsblk_t bg_start; ext4_fsblk_t last_block; ext4_grpblk_t colour; - ext4_group_t block_group; - int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); /* Try to find previous block */ for (p = ind->p - 1; p >= start; p--) { @@ -517,22 +474,9 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) * It is going to be referred to from the inode itself? OK, just put it * into the same cylinder group then. */ - block_group = ei->i_block_group; - if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { - block_group &= ~(flex_size-1); - if (S_ISREG(inode->i_mode)) - block_group++; - } - bg_start = ext4_group_first_block_no(inode->i_sb, block_group); + bg_start = ext4_group_first_block_no(inode->i_sb, ei->i_block_group); last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; - /* - * If we are doing delayed allocation, we don't need take - * colour into account. 
- */ - if (test_opt(inode->i_sb, DELALLOC)) - return bg_start; - if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) colour = (current->pid % 16) * (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); @@ -1108,16 +1052,9 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used) /* * free those over-booking quota for metadata blocks */ + if (mdb_free) vfs_dq_release_reservation_block(inode, mdb_free); - - /* - * If we have done all the pending block allocations and if - * there aren't any writers on the inode, we can discard the - * inode's preallocations. - */ - if (!total && (atomic_read(&inode->i_writecount) == 0)) - ext4_discard_preallocations(inode); } /* @@ -1751,10 +1688,9 @@ static void ext4_da_page_release_reservation(struct page *page, struct mpage_da_data { struct inode *inode; - sector_t b_blocknr; /* start block number of extent */ - size_t b_size; /* size of extent */ - unsigned long b_state; /* state of the extent */ + struct buffer_head lbh; /* extent of blocks */ unsigned long first_page, next_page; /* extent of pages */ + get_block_t *get_block; struct writeback_control *wbc; int io_done; int pages_written; @@ -1768,6 +1704,7 @@ struct mpage_da_data { * @mpd->inode: inode * @mpd->first_page: first page of the extent * @mpd->next_page: page after the last page of the extent + * @mpd->get_block: the filesystem's block mapper function * * By the time mpage_da_submit_io() is called we expect all blocks * to be allocated. this may be wrong if allocation failed. @@ -1787,7 +1724,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd) /* * We need to start from the first_page to the next_page - 1 * to make sure we also write the mapped dirty buffer_heads. - * If we look at mpd->b_blocknr we would only be looking + * If we look at mpd->lbh.b_blocknr we would only be looking * at the currently mapped buffer_heads. */ index = mpd->first_page; @@ -1977,111 +1914,68 @@ static void ext4_print_free_blocks(struct inode *inode) return; } -#define EXT4_DELALLOC_RSVED 1 -static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) -{ - int ret; - unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; - loff_t disksize = EXT4_I(inode)->i_disksize; - handle_t *handle = NULL; - - handle = ext4_journal_current_handle(); - BUG_ON(!handle); - ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks, - bh_result, create, 0, EXT4_DELALLOC_RSVED); - if (ret <= 0) - return ret; - - bh_result->b_size = (ret << inode->i_blkbits); - - if (ext4_should_order_data(inode)) { - int retval; - retval = ext4_jbd2_file_inode(handle, inode); - if (retval) - /* - * Failed to add inode for ordered mode. Don't - * update file size - */ - return retval; - } - - /* - * Update on-disk size along with block allocation we don't - * use 'extend_disksize' as size may change within already - * allocated block -bzzz - */ - disksize = ((loff_t) iblock + ret) << inode->i_blkbits; - if (disksize > i_size_read(inode)) - disksize = i_size_read(inode); - if (disksize > EXT4_I(inode)->i_disksize) { - ext4_update_i_disksize(inode, disksize); - ret = ext4_mark_inode_dirty(handle, inode); - return ret; - } - return 0; -} - /* * mpage_da_map_blocks - go through given space * - * @mpd - bh describing space + * @mpd->lbh - bh describing space + * @mpd->get_block - the filesystem's block mapper function * * The function skips space we know is already mapped to disk blocks. 
* */ -static int mpage_da_map_blocks(struct mpage_da_data *mpd) +static int mpage_da_map_blocks(struct mpage_da_data *mpd) { int err = 0; struct buffer_head new; + struct buffer_head *lbh = &mpd->lbh; sector_t next; /* * We consider only non-mapped and non-allocated blocks */ - if ((mpd->b_state & (1 << BH_Mapped)) && - !(mpd->b_state & (1 << BH_Delay))) + if (buffer_mapped(lbh) && !buffer_delay(lbh)) return 0; - new.b_state = mpd->b_state; + new.b_state = lbh->b_state; new.b_blocknr = 0; - new.b_size = mpd->b_size; - next = mpd->b_blocknr; + new.b_size = lbh->b_size; + next = lbh->b_blocknr; /* * If we didn't accumulate anything * to write simply return */ if (!new.b_size) return 0; - - err = ext4_da_get_block_write(mpd->inode, next, &new, 1); + err = mpd->get_block(mpd->inode, next, &new, 1); if (err) { - /* - * If get block returns with error we simply - * return. Later writepage will redirty the page and - * writepages will find the dirty page again + + /* If get block returns with error + * we simply return. Later writepage + * will redirty the page and writepages + * will find the dirty page again */ if (err == -EAGAIN) return 0; if (err == -ENOSPC && - ext4_count_free_blocks(mpd->inode->i_sb)) { + ext4_count_free_blocks(mpd->inode->i_sb)) { mpd->retval = err; return 0; } /* - * get block failure will cause us to loop in - * writepages, because a_ops->writepage won't be able - * to make progress. The page will be redirtied by - * writepage and writepages will again try to write - * the same. + * get block failure will cause us + * to loop in writepages. Because + * a_ops->writepage won't be able to + * make progress. The page will be redirtied + * by writepage and writepages will again + * try to write the same. */ printk(KERN_EMERG "%s block allocation failed for inode %lu " "at logical offset %llu with max blocks " "%zd with error %d\n", __func__, mpd->inode->i_ino, (unsigned long long)next, - mpd->b_size >> mpd->inode->i_blkbits, err); + lbh->b_size >> mpd->inode->i_blkbits, err); printk(KERN_EMERG "This should not happen.!! " "Data will be lost\n"); if (err == -ENOSPC) { @@ -2089,7 +1983,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) } /* invlaidate all the pages */ ext4_da_block_invalidatepages(mpd, next, - mpd->b_size >> mpd->inode->i_blkbits); + lbh->b_size >> mpd->inode->i_blkbits); return err; } BUG_ON(new.b_size == 0); @@ -2101,8 +1995,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) * If blocks are delayed marked, we need to * put actual blocknr and drop delayed bit */ - if ((mpd->b_state & (1 << BH_Delay)) || - (mpd->b_state & (1 << BH_Unwritten))) + if (buffer_delay(lbh) || buffer_unwritten(lbh)) mpage_put_bnr_to_bhs(mpd, next, &new); return 0; @@ -2121,11 +2014,12 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) * the function is used to collect contig. 
blocks in same state */ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, - sector_t logical, size_t b_size, - unsigned long b_state) + sector_t logical, struct buffer_head *bh) { sector_t next; - int nrblocks = mpd->b_size >> mpd->inode->i_blkbits; + size_t b_size = bh->b_size; + struct buffer_head *lbh = &mpd->lbh; + int nrblocks = lbh->b_size >> mpd->inode->i_blkbits; /* check if thereserved journal credits might overflow */ if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) { @@ -2152,19 +2046,19 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, /* * First block in the extent */ - if (mpd->b_size == 0) { - mpd->b_blocknr = logical; - mpd->b_size = b_size; - mpd->b_state = b_state & BH_FLAGS; + if (lbh->b_size == 0) { + lbh->b_blocknr = logical; + lbh->b_size = b_size; + lbh->b_state = bh->b_state & BH_FLAGS; return; } - next = mpd->b_blocknr + nrblocks; + next = lbh->b_blocknr + nrblocks; /* * Can we merge the block to our big extent? */ - if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) { - mpd->b_size += b_size; + if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) { + lbh->b_size += b_size; return; } @@ -2193,7 +2087,7 @@ static int __mpage_da_writepage(struct page *page, { struct mpage_da_data *mpd = data; struct inode *inode = mpd->inode; - struct buffer_head *bh, *head; + struct buffer_head *bh, *head, fake; sector_t logical; if (mpd->io_done) { @@ -2235,9 +2129,9 @@ static int __mpage_da_writepage(struct page *page, /* * ... and blocks */ - mpd->b_size = 0; - mpd->b_state = 0; - mpd->b_blocknr = 0; + mpd->lbh.b_size = 0; + mpd->lbh.b_state = 0; + mpd->lbh.b_blocknr = 0; } mpd->next_page = page->index + 1; @@ -2245,8 +2139,16 @@ static int __mpage_da_writepage(struct page *page, (PAGE_CACHE_SHIFT - inode->i_blkbits); if (!page_has_buffers(page)) { - mpage_add_bh_to_extent(mpd, logical, PAGE_CACHE_SIZE, - (1 << BH_Dirty) | (1 << BH_Uptodate)); + /* + * There are no buffer heads attached yet (mmap?), + * so we treat the page as full of dirty blocks + */ + bh = &fake; + bh->b_size = PAGE_CACHE_SIZE; + bh->b_state = 0; + set_buffer_dirty(bh); + set_buffer_uptodate(bh); + mpage_add_bh_to_extent(mpd, logical, bh); if (mpd->io_done) return MPAGE_DA_EXTENT_TAIL; } else { @@ -2264,10 +2166,8 @@ static int __mpage_da_writepage(struct page *page, * with the page in ext4_da_writepage */ if (buffer_dirty(bh) && - (!buffer_mapped(bh) || buffer_delay(bh))) { - mpage_add_bh_to_extent(mpd, logical, - bh->b_size, - bh->b_state); + (!buffer_mapped(bh) || buffer_delay(bh))) { + mpage_add_bh_to_extent(mpd, logical, bh); if (mpd->io_done) return MPAGE_DA_EXTENT_TAIL; } else if (buffer_dirty(bh) && (buffer_mapped(bh))) { @@ -2279,8 +2179,9 @@ static int __mpage_da_writepage(struct page *page, * unmapped buffer_head later we need to * use the b_state flag of that buffer_head. */ - if (mpd->b_size == 0) - mpd->b_state = bh->b_state & BH_FLAGS; + if (mpd->lbh.b_size == 0) + mpd->lbh.b_state = + bh->b_state & BH_FLAGS; } logical++; } while ((bh = bh->b_this_page) != head); @@ -2289,6 +2190,51 @@ static int __mpage_da_writepage(struct page *page, return 0; } +/* + * mpage_da_writepages - walk the list of dirty pages of the given + * address space, allocate non-allocated blocks, map newly-allocated + * blocks to existing bhs and issue IO on them + * + * @mapping: address space structure to write + * @wbc: subtract the number of written pages from *@wbc->nr_to_write + * @get_block: the filesystem's block mapper function.
+ * + * This is a library function, which implements the writepages() + * address_space_operation. + */ +static int mpage_da_writepages(struct address_space *mapping, + struct writeback_control *wbc, + struct mpage_da_data *mpd) +{ + int ret; + + if (!mpd->get_block) + return generic_writepages(mapping, wbc); + + mpd->lbh.b_size = 0; + mpd->lbh.b_state = 0; + mpd->lbh.b_blocknr = 0; + mpd->first_page = 0; + mpd->next_page = 0; + mpd->io_done = 0; + mpd->pages_written = 0; + mpd->retval = 0; + + ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd); + /* + * Handle last extent of pages + */ + if (!mpd->io_done && mpd->next_page != mpd->first_page) { + if (mpage_da_map_blocks(mpd) == 0) + mpage_da_submit_io(mpd); + + mpd->io_done = 1; + ret = MPAGE_DA_EXTENT_TAIL; + } + wbc->nr_to_write -= mpd->pages_written; + return ret; +} + /* * this is a special callback for ->write_begin() only * it's intention is to return mapped block or reserve space @@ -2328,6 +2274,51 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, return ret; } +#define EXT4_DELALLOC_RSVED 1 +static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int ret; + unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; + loff_t disksize = EXT4_I(inode)->i_disksize; + handle_t *handle = NULL; + + handle = ext4_journal_current_handle(); + BUG_ON(!handle); + ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks, + bh_result, create, 0, EXT4_DELALLOC_RSVED); + if (ret > 0) { + + bh_result->b_size = (ret << inode->i_blkbits); + + if (ext4_should_order_data(inode)) { + int retval; + retval = ext4_jbd2_file_inode(handle, inode); + if (retval) + /* + * Failed to add inode for ordered + * mode. Don't update file size + */ + return retval; + } + + /* + * Update on-disk size along with block allocation + * we don't use 'extend_disksize' as size may change + * within already allocated block -bzzz + */ + disksize = ((loff_t) iblock + ret) << inode->i_blkbits; + if (disksize > i_size_read(inode)) + disksize = i_size_read(inode); + if (disksize > EXT4_I(inode)->i_disksize) { + ext4_update_i_disksize(inode, disksize); + ret = ext4_mark_inode_dirty(handle, inode); + return ret; + } + ret = 0; + } + return ret; +} static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) { @@ -2578,38 +2569,8 @@ static int ext4_da_writepages(struct address_space *mapping, dump_stack(); goto out_writepages; } - - /* - * Now call __mpage_da_writepage to find the next - * contiguous region of logical blocks that need - * blocks to be allocated by ext4. We don't actually - * submit the blocks for I/O here, even though - * write_cache_pages thinks it will, and will set the - * pages as clean for write before calling - * __mpage_da_writepage(). - */ - mpd.b_size = 0; - mpd.b_state = 0; - mpd.b_blocknr = 0; - mpd.first_page = 0; - mpd.next_page = 0; - mpd.io_done = 0; - mpd.pages_written = 0; - mpd.retval = 0; - ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, - &mpd); - /* - * If we have a contigous extent of pages and we - * haven't done the I/O yet, map the blocks and submit - * them for I/O. 
- */ - if (!mpd.io_done && mpd.next_page != mpd.first_page) { - if (mpage_da_map_blocks(&mpd) == 0) - mpage_da_submit_io(&mpd); - mpd.io_done = 1; - ret = MPAGE_DA_EXTENT_TAIL; - } - wbc->nr_to_write -= mpd.pages_written; + mpd.get_block = ext4_da_get_block_write; + ret = mpage_da_writepages(mapping, wbc, &mpd); ext4_journal_stop(handle); @@ -2885,48 +2846,6 @@ static void ext4_da_invalidatepage(struct page *page, unsigned long offset) return; } -/* - * Force all delayed allocation blocks to be allocated for a given inode. - */ -int ext4_alloc_da_blocks(struct inode *inode) -{ - if (!EXT4_I(inode)->i_reserved_data_blocks && - !EXT4_I(inode)->i_reserved_meta_blocks) - return 0; - - /* - * We do something simple for now. The filemap_flush() will - * also start triggering a write of the data blocks, which is - * not strictly speaking necessary (and for users of - * laptop_mode, not even desirable). However, to do otherwise - * would require replicating code paths in: - * - * ext4_da_writepages() -> - * write_cache_pages() ---> (via passed in callback function) - * __mpage_da_writepage() --> - * mpage_add_bh_to_extent() - * mpage_da_map_blocks() - * - * The problem is that write_cache_pages(), located in - * mm/page-writeback.c, marks pages clean in preparation for - * doing I/O, which is not desirable if we're not planning on - * doing I/O at all. - * - * We could call write_cache_pages(), and then redirty all of - * the pages by calling redirty_page_for_writeback() but that - * would be ugly in the extreme. So instead we would need to - * replicate parts of the code in the above functions, - * simplifying them becuase we wouldn't actually intend to - * write out the pages, but rather only collect contiguous - * logical block extents, call the multi-block allocator, and - * then update the buffer heads with the block allocations. - * - * For now, though, we'll cheat by calling filemap_flush(), - * which will map the blocks, and start the I/O, but not - * actually wait for the I/O to complete. - */ - return filemap_flush(inode->i_mapping); -} /* * bmap() is special. It gets used by applications such as lilo and by @@ -3949,9 +3868,6 @@ void ext4_truncate(struct inode *inode) if (!ext4_can_truncate(inode)) return; - if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) - ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE; - if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { ext4_ext_truncate(inode); return; @@ -4194,7 +4110,12 @@ static int __ext4_get_inode_loc(struct inode *inode, unsigned num; table = ext4_inode_table(sb, gdp); - /* s_inode_readahead_blks is always a power of 2 */ + /* Make sure s_inode_readahead_blks is a power of 2 */ + while (EXT4_SB(sb)->s_inode_readahead_blks & + (EXT4_SB(sb)->s_inode_readahead_blks-1)) + EXT4_SB(sb)->s_inode_readahead_blks = + (EXT4_SB(sb)->s_inode_readahead_blks & + (EXT4_SB(sb)->s_inode_readahead_blks-1)); b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1); if (table > b) b = table; @@ -4366,7 +4287,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ei->i_disksize = inode->i_size; inode->i_generation = le32_to_cpu(raw_inode->i_generation); ei->i_block_group = iloc.block_group; - ei->i_last_alloc_group = ~0; /* * NOTE! The in-memory inode i_data array is in little-endian order * even on big-endian machines: we do NOT byteswap the block numbers! 
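The loop added to __ext4_get_inode_loc() rounds s_inode_readahead_blks down to a power of two by clearing the lowest set bit, x & (x - 1), until a single bit remains. The same idiom in isolation:

#include <stdio.h>

/* Round down to a power of two by clearing low-order set bits,
 * the same x &= (x - 1) idiom applied to s_inode_readahead_blks. */
static unsigned round_down_pow2(unsigned x)
{
	while (x & (x - 1))
		x &= (x - 1);	/* drop the lowest set bit */
	return x;		/* 0 stays 0; otherwise the highest set bit */
}

int main(void)
{
	printf("%u %u %u\n", round_down_pow2(24), round_down_pow2(32),
	       round_down_pow2(1)); /* 16 32 1 */
	return 0;
}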
@@ -4409,20 +4329,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; } - if (ei->i_flags & EXT4_EXTENTS_FL) { - /* Validate extent which is part of inode */ - ret = ext4_ext_check_inode(inode); - } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - (S_ISLNK(inode->i_mode) && - !ext4_inode_is_fast_symlink(inode))) { - /* Validate block references which are part of inode */ - ret = ext4_check_inode_blockref(inode); - } - if (ret) { - brelse(bh); - goto bad_inode; - } - if (S_ISREG(inode->i_mode)) { inode->i_op = &ext4_file_inode_operations; inode->i_fop = &ext4_file_operations; @@ -4439,8 +4345,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) inode->i_op = &ext4_symlink_inode_operations; ext4_set_aops(inode); } - } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || - S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { + } else { inode->i_op = &ext4_special_inode_operations; if (raw_inode->i_block[0]) init_special_inode(inode, inode->i_mode, @@ -4448,13 +4353,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) else init_special_inode(inode, inode->i_mode, new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); - } else { - brelse(bh); - ret = -EIO; - ext4_error(inode->i_sb, __func__, - "bogus i_mode (%o) for inode=%lu", - inode->i_mode, inode->i_ino); - goto bad_inode; } brelse(iloc.bh); ext4_set_inode_flags(inode); diff --git a/trunk/fs/ext4/ioctl.c b/trunk/fs/ext4/ioctl.c index 91e75f7a9e73..42dc83fb247a 100644 --- a/trunk/fs/ext4/ioctl.c +++ b/trunk/fs/ext4/ioctl.c @@ -48,7 +48,8 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (err) return err; - flags = ext4_mask_flags(inode->i_mode, flags); + if (!S_ISDIR(inode->i_mode)) + flags &= ~EXT4_DIRSYNC_FL; err = -EPERM; mutex_lock(&inode->i_mutex); @@ -262,20 +263,6 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return err; } - case EXT4_IOC_ALLOC_DA_BLKS: - { - int err; - if (!is_owner_or_cap(inode)) - return -EACCES; - - err = mnt_want_write(filp->f_path.mnt); - if (err) - return err; - err = ext4_alloc_da_blocks(inode); - mnt_drop_write(filp->f_path.mnt); - return err; - } - default: return -ENOTTY; } diff --git a/trunk/fs/ext4/mballoc.c b/trunk/fs/ext4/mballoc.c index f871677a7984..b038188bd039 100644 --- a/trunk/fs/ext4/mballoc.c +++ b/trunk/fs/ext4/mballoc.c @@ -46,23 +46,22 @@ * The allocation request involves a request for multiple blocks * near to the goal (block) value specified. * - * During initialization phase of the allocator we decide to use the - * group preallocation or inode preallocation depending on the size of - * the file. The size of the file could be the resulting file size we - * would have after allocation, or the current file size, whichever - * is larger. If the size is less than sbi->s_mb_stream_request we - * select to use the group preallocation. The default value of - * s_mb_stream_request is 16 blocks. This can also be tuned via - * /sys/fs/ext4//mb_stream_req. The value is represented in - * terms of number of blocks. + * During initialization phase of the allocator we decide to use the group + * preallocation or inode preallocation depending on the size of the file. The + * size of the file could be the resulting file size we would have after + * allocation or the current file size, whichever is larger. If the size is + * less than sbi->s_mb_stream_request we select the group + * preallocation.
The default value of s_mb_stream_request is 16 + * blocks. This can also be tuned via + * /proc/fs/ext4//stream_req. The value is represented in terms + * of number of blocks. * * The main motivation for having small files use group preallocation is to - * ensure that we have small files closer together on the disk. + * ensure that we have small files closer on the disk. * - * First stage the allocator looks at the inode prealloc list, - * ext4_inode_info->i_prealloc_list, which contains list of prealloc - * spaces for this particular inode. The inode prealloc space is - * represented as: + * First stage the allocator looks at the inode prealloc list + * ext4_inode_info->i_prealloc_list, which contains a list of prealloc spaces for + * this particular inode. The inode prealloc space is represented as: * * pa_lstart -> the logical start block for this prealloc space * pa_pstart -> the physical start block for this prealloc space @@ -122,29 +121,29 @@ * list. In case of inode preallocation we follow a list of heuristics * based on file size. This can be found in ext4_mb_normalize_request. If * we are doing a group prealloc we try to normalize the request to - * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is + * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is set to * 512 blocks. This can be tuned via - * /sys/fs/ext4/ option the group prealloc request is normalized to the * stripe value (sbi->s_stripe) * - * The regular allocator(using the buddy cache) supports few tunables. + * The regular allocator(using the buddy cache) supports a few tunables. * - * /sys/fs/ext4//mb_min_to_scan - * /sys/fs/ext4//mb_max_to_scan - * /sys/fs/ext4//mb_order2_req + * /proc/fs/ext4//min_to_scan + * /proc/fs/ext4//max_to_scan + * /proc/fs/ext4//order2_req * - * The regular allocator uses buddy scan only if the request len is a power of + * The regular allocator uses buddy scan only if the request len is a power of * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The * value of s_mb_order2_reqs can be tuned via - * /sys/fs/ext4//mb_order2_req. If the request len is equal to + * /proc/fs/ext4//order2_req. If the request len is equal to * stripe size (sbi->s_stripe), we try to search for contiguous blocks in - * stripe size. This should result in better allocation on RAID setups. If - * not, we search in the specific group using bitmap for best extents. The - * tunable min_to_scan and max_to_scan control the behaviour here. + * stripe size. This should result in better allocation on RAID setups. If + * not, we search in the specific group using bitmap for best extents. The + * tunable min_to_scan and max_to_scan control the behaviour here. * min_to_scan indicates how long the mballoc __must__ look for a best - * extent and max_to_scan indicates how long the mballoc __can__ look for a + * extent and max_to_scan indicates how long the mballoc __can__ look for a * best extent in the found extents. Searching for the blocks starts with * the group specified as the goal value in allocation context via * ac_g_ex.
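Stripped of the surrounding machinery, the policy described above is a single comparison: a file whose relevant size (the larger of the current and resulting sizes) stays below s_mb_stream_request draws from the shared group preallocation, while anything bigger gets its own inode preallocation. A sketch of that decision, using the 16-block default:

#include <stdio.h>

enum prealloc { GROUP_PA, INODE_PA };

/* Model of the heuristic the comment describes: small files share the
 * locality-group preallocation so they end up packed close together;
 * the 16-block threshold mirrors the s_mb_stream_request default. */
static enum prealloc choose_prealloc(unsigned long size_blocks,
				     unsigned long stream_req)
{
	return size_blocks < stream_req ? GROUP_PA : INODE_PA;
}

int main(void)
{
	printf("%s\n", choose_prealloc(4, 16) == GROUP_PA ? "group" : "inode");
	printf("%s\n", choose_prealloc(64, 16) == GROUP_PA ? "group" : "inode");
	return 0;
}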
Each group is first checked based on the criteria whether it @@ -338,6 +337,8 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t group); static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, ext4_group_t group); +static int ext4_mb_init_per_dev_proc(struct super_block *sb); +static int ext4_mb_destroy_per_dev_proc(struct super_block *sb); static void release_blocks_on_commit(journal_t *journal, transaction_t *txn); @@ -1725,7 +1726,6 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, { unsigned free, fragments; unsigned i, bits; - int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb)); struct ext4_group_desc *desc; struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); @@ -1747,12 +1747,6 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) return 0; - /* Avoid using the first bg of a flexgroup for data files */ - if ((ac->ac_flags & EXT4_MB_HINT_DATA) && - (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) && - ((group % flex_size) == 0)) - return 0; - bits = ac->ac_sb->s_blocksize_bits + 1; for (i = ac->ac_2order; i <= bits; i++) if (grp->bb_counters[i] > 0) @@ -1977,7 +1971,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) /* * We search using buddy data only if the order of the request * is greater than equal to the sbi_s_mb_order2_reqs - * You can tune it via /sys/fs/ext4//mb_order2_req + * You can tune it via /proc/fs/ext4//order2_req */ if (i >= sbi->s_mb_order2_reqs) { /* @@ -2699,7 +2693,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) i = (sb->s_blocksize_bits + 2) * sizeof(unsigned int); sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); if (sbi->s_mb_maxs == NULL) { - kfree(sbi->s_mb_offsets); + kfree(sbi->s_mb_maxs); return -ENOMEM; } @@ -2752,6 +2746,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) spin_lock_init(&lg->lg_prealloc_lock); } + ext4_mb_init_per_dev_proc(sb); ext4_mb_history_init(sb); if (sbi->s_journal) @@ -2834,6 +2829,7 @@ int ext4_mb_release(struct super_block *sb) free_percpu(sbi->s_locality_groups); ext4_mb_history_release(sb); + ext4_mb_destroy_per_dev_proc(sb); return 0; } @@ -2894,6 +2890,62 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) mb_debug("freed %u blocks in %u structures\n", count, count2); } +#define EXT4_MB_STATS_NAME "stats" +#define EXT4_MB_MAX_TO_SCAN_NAME "max_to_scan" +#define EXT4_MB_MIN_TO_SCAN_NAME "min_to_scan" +#define EXT4_MB_ORDER2_REQ "order2_req" +#define EXT4_MB_STREAM_REQ "stream_req" +#define EXT4_MB_GROUP_PREALLOC "group_prealloc" + +static int ext4_mb_init_per_dev_proc(struct super_block *sb) +{ +#ifdef CONFIG_PROC_FS + mode_t mode = S_IFREG | S_IRUGO | S_IWUSR; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct proc_dir_entry *proc; + + if (sbi->s_proc == NULL) + return -EINVAL; + + EXT4_PROC_HANDLER(EXT4_MB_STATS_NAME, mb_stats); + EXT4_PROC_HANDLER(EXT4_MB_MAX_TO_SCAN_NAME, mb_max_to_scan); + EXT4_PROC_HANDLER(EXT4_MB_MIN_TO_SCAN_NAME, mb_min_to_scan); + EXT4_PROC_HANDLER(EXT4_MB_ORDER2_REQ, mb_order2_reqs); + EXT4_PROC_HANDLER(EXT4_MB_STREAM_REQ, mb_stream_request); + EXT4_PROC_HANDLER(EXT4_MB_GROUP_PREALLOC, mb_group_prealloc); + return 0; + +err_out: + remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc); + remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc); + remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc); + remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc); + 
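The i >= sbi->s_mb_order2_reqs gate above only ever fires for power-of-two request lengths. A small model of the eligibility test; the kernel computes the order differently (via fls()), so this is a sketch rather than the real code:

#include <stdio.h>

/* Sketch of the buddy-scan eligibility test: the request length must be
 * a power of two and its order at least order2_reqs (cf. s_mb_order2_reqs). */
static int use_buddy_scan(unsigned len, unsigned order2_reqs)
{
	unsigned order = 0;

	if (len == 0 || (len & (len - 1)))
		return 0;		/* not a power of two */
	while ((1u << order) < len)
		order++;
	return order >= order2_reqs;
}

int main(void)
{
	printf("%d %d %d\n", use_buddy_scan(16, 2),	/* 1: order 4 >= 2 */
	       use_buddy_scan(12, 2),			/* 0: not a power of 2 */
	       use_buddy_scan(2, 2));			/* 0: order 1 < 2 */
	return 0;
}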
remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc); + remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc); + return -ENOMEM; +#else + return 0; +#endif +} + +static int ext4_mb_destroy_per_dev_proc(struct super_block *sb) +{ +#ifdef CONFIG_PROC_FS + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (sbi->s_proc == NULL) + return -EINVAL; + + remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc); + remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc); + remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc); + remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc); + remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc); + remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc); +#endif + return 0; +} + int __init init_ext4_mballoc(void) { ext4_pspace_cachep = @@ -3044,8 +3096,9 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, ac->ac_b_ex.fe_group); - atomic_sub(ac->ac_b_ex.fe_len, - &sbi->s_flex_groups[flex_group].free_blocks); + spin_lock(sb_bgl_lock(sbi, flex_group)); + sbi->s_flex_groups[flex_group].free_blocks -= ac->ac_b_ex.fe_len; + spin_unlock(sb_bgl_lock(sbi, flex_group)); } err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); @@ -3063,7 +3116,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, * here we normalize request for locality group * Group request are normalized to s_strip size if we set the same via mount * option. If not we set it to s_mb_group_prealloc which can be configured via - * /sys/fs/ext4//mb_group_prealloc + * /proc/fs/ext4//group_prealloc * * XXX: should we try to preallocate more than the group has now? */ @@ -3555,11 +3608,8 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, spin_unlock(&pa->pa_lock); grp_blk = pa->pa_pstart; - /* - * If doing group-based preallocation, pa_pstart may be in the - * next group when pa is used up - */ - if (pa->pa_type == MB_GROUP_PA) + /* If linear, pa_pstart may be in the next group when pa is used up */ + if (pa->pa_linear) grp_blk--; ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL); @@ -3654,7 +3704,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) INIT_LIST_HEAD(&pa->pa_inode_list); INIT_LIST_HEAD(&pa->pa_group_list); pa->pa_deleted = 0; - pa->pa_type = MB_INODE_PA; + pa->pa_linear = 0; mb_debug("new inode pa %p: %llu/%u for %u\n", pa, pa->pa_pstart, pa->pa_len, pa->pa_lstart); @@ -3717,7 +3767,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac) INIT_LIST_HEAD(&pa->pa_inode_list); INIT_LIST_HEAD(&pa->pa_group_list); pa->pa_deleted = 0; - pa->pa_type = MB_GROUP_PA; + pa->pa_linear = 1; mb_debug("new group pa %p: %llu/%u for %u\n", pa, pa->pa_pstart, pa->pa_len, pa->pa_lstart); @@ -3971,7 +4021,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, list_del_rcu(&pa->pa_inode_list); spin_unlock(pa->pa_obj_lock); - if (pa->pa_type == MB_GROUP_PA) + if (pa->pa_linear) ext4_mb_release_group_pa(&e4b, pa, ac); else ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac); @@ -4071,7 +4121,7 @@ void ext4_discard_preallocations(struct inode *inode) spin_unlock(&ei->i_prealloc_lock); list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { - BUG_ON(pa->pa_type != MB_INODE_PA); + BUG_ON(pa->pa_linear != 0); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); err = ext4_mb_load_buddy(sb, group, &e4b); @@ -4182,7 +4232,7 @@ static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac) * file is determined by the current size or the resulting size after * 
allocation, whichever is larger * - * One can tune this size via /sys/fs/ext4//mb_stream_req + * One can tune this size via /proc/fs/ext4//stream_req */ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) { @@ -4323,7 +4373,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, continue; } /* only lg prealloc space */ - BUG_ON(pa->pa_type != MB_GROUP_PA); + BUG_ON(!pa->pa_linear); /* seems this one can be freed ... */ pa->pa_deleted = 1; @@ -4392,7 +4442,7 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) pa_inode_list) { spin_lock(&tmp_pa->pa_lock); if (tmp_pa->pa_deleted) { - spin_unlock(&tmp_pa->pa_lock); + spin_unlock(&pa->pa_lock); continue; } if (!added && pa->pa_free < tmp_pa->pa_free) { @@ -4429,7 +4479,7 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac) { struct ext4_prealloc_space *pa = ac->ac_pa; if (pa) { - if (pa->pa_type == MB_GROUP_PA) { + if (pa->pa_linear) { /* see comment in ext4_mb_use_group_pa() */ spin_lock(&pa->pa_lock); pa->pa_pstart += ac->ac_b_ex.fe_len; @@ -4449,7 +4499,7 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac) * doesn't grow big. We need to release * alloc_semp before calling ext4_mb_add_n_trim() */ - if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) { + if (pa->pa_linear && likely(pa->pa_free)) { spin_lock(pa->pa_obj_lock); list_del_rcu(&pa->pa_inode_list); spin_unlock(pa->pa_obj_lock); @@ -4886,7 +4936,9 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode, if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, block_group); - atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks); + spin_lock(sb_bgl_lock(sbi, flex_group)); + sbi->s_flex_groups[flex_group].free_blocks += count; + spin_unlock(sb_bgl_lock(sbi, flex_group)); } ext4_mb_release_desc(&e4b); diff --git a/trunk/fs/ext4/mballoc.h b/trunk/fs/ext4/mballoc.h index dd9e6cd5f6cf..10a2921baf14 100644 --- a/trunk/fs/ext4/mballoc.h +++ b/trunk/fs/ext4/mballoc.h @@ -132,15 +132,12 @@ struct ext4_prealloc_space { ext4_lblk_t pa_lstart; /* log. block */ unsigned short pa_len; /* len of preallocated chunk */ unsigned short pa_free; /* how many blocks are free */ - unsigned short pa_type; /* pa type. inode or group */
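Both flex-group hunks above replace atomic counter updates with a plain read-modify-write under sb_bgl_lock(). The shape is an ordinary lock/update/unlock critical section; in userspace pthread terms:

#include <pthread.h>
#include <stdio.h>

/* Userspace analogue of the sb_bgl_lock()-protected update that replaces
 * atomic_add()/atomic_sub() on the flex group's free_blocks counter. */
struct flex_group {
	pthread_mutex_t lock;	/* stands in for sb_bgl_lock() */
	long free_blocks;
};

static void flex_add_blocks(struct flex_group *fg, long count)
{
	pthread_mutex_lock(&fg->lock);
	fg->free_blocks += count;	/* whole read-modify-write is protected */
	pthread_mutex_unlock(&fg->lock);
}

int main(void)
{
	struct flex_group fg = { PTHREAD_MUTEX_INITIALIZER, 1000 };

	flex_add_blocks(&fg, 8);
	printf("%ld\n", fg.free_blocks); /* 1008 */
	return 0;
}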
+ unsigned short pa_linear; /* consumed in one direction + * strictly, for grp prealloc */ spinlock_t *pa_obj_lock; struct inode *pa_inode; /* hack, for history only */ }; -enum { - MB_INODE_PA = 0, - MB_GROUP_PA = 1 -}; struct ext4_free_extent { ext4_lblk_t fe_logical; @@ -250,6 +247,7 @@ static inline void ext4_mb_store_history(struct ext4_allocation_context *ac) #define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) +struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t); static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, struct ext4_free_extent *fex) { diff --git a/trunk/fs/ext4/namei.c b/trunk/fs/ext4/namei.c index 22098e1cd085..83410244d3ee 100644 --- a/trunk/fs/ext4/namei.c +++ b/trunk/fs/ext4/namei.c @@ -161,12 +161,12 @@ static struct dx_frame *dx_probe(const struct qstr *d_name, struct dx_frame *frame, int *err); static void dx_release(struct dx_frame *frames); -static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize, +static int dx_make_map(struct ext4_dir_entry_2 *de, int size, struct dx_hash_info *hinfo, struct dx_map_entry map[]); static void dx_sort_map(struct dx_map_entry *map, unsigned count); static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to, - struct dx_map_entry *offsets, int count, unsigned blocksize); -static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize); + struct dx_map_entry *offsets, int count); +static struct ext4_dir_entry_2* dx_pack_dirents(char *base, int size); static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block); static int ext4_htree_next_block(struct inode *dir, __u32 hash, @@ -180,38 +180,14 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, struct inode *inode); -unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize) -{ - unsigned len = le16_to_cpu(dlen); - - if (len == EXT4_MAX_REC_LEN || len == 0) - return blocksize; - return (len & 65532) | ((len & 3) << 16); -} - -__le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) -{ - if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3)) - BUG(); - if (len < 65536) - return cpu_to_le16(len); - if (len == blocksize) { - if (blocksize == 65536) - return cpu_to_le16(EXT4_MAX_REC_LEN); - else - return cpu_to_le16(0); - } - return cpu_to_le16((len & 65532) | ((len >> 16) & 3)); -} - /* * p is at least 6 bytes before the end of page */ static inline struct ext4_dir_entry_2 * -ext4_next_entry(struct ext4_dir_entry_2 *p, unsigned long blocksize) +ext4_next_entry(struct ext4_dir_entry_2 *p) { return (struct ext4_dir_entry_2 *)((char *)p + - ext4_rec_len_from_disk(p->rec_len, blocksize)); + ext4_rec_len_from_disk(p->rec_len)); } /* @@ -318,7 +294,7 @@ static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_ent space += EXT4_DIR_REC_LEN(de->name_len); names++; } - de = ext4_next_entry(de, size); + de = ext4_next_entry(de); } printk("(%i)\n", names); return (struct stats) { names, space, 1 }; } @@ -609,7 +585,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, top = (struct ext4_dir_entry_2 *) ((char *) de + dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0)); - for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) { + for (; de < top; de = ext4_next_entry(de)) { if (!ext4_check_dir_entry("htree_dirblock_to_tree", dir, de, bh, (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb)) +((char *)de - bh->b_data))) { @@ -687,7 +663,7 @@ int
ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, } if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) { de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data; - de = ext4_next_entry(de, dir->i_sb->s_blocksize); + de = ext4_next_entry(de); if ((err = ext4_htree_store_dirent(dir_file, 2, 0, de)) != 0) goto errout; count++; @@ -737,15 +713,15 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, * Create map of hash values, offsets, and sizes, stored at end of block. * Returns number of entries mapped. */ -static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize, - struct dx_hash_info *hinfo, - struct dx_map_entry *map_tail) +static int dx_make_map (struct ext4_dir_entry_2 *de, int size, + struct dx_hash_info *hinfo, struct dx_map_entry *map_tail) { int count = 0; char *base = (char *) de; struct dx_hash_info h = *hinfo; - while ((char *) de < base + blocksize) { + while ((char *) de < base + size) + { if (de->name_len && de->inode) { ext4fs_dirhash(de->name, de->name_len, &h); map_tail--; @@ -756,7 +732,7 @@ static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize, cond_resched(); } /* XXX: do we need to check rec_len == 0 case? -Chris */ - de = ext4_next_entry(de, blocksize); + de = ext4_next_entry(de); } return count; } @@ -856,8 +832,7 @@ static inline int search_dirblock(struct buffer_head *bh, return 1; } /* prevent looping on a bad block */ - de_len = ext4_rec_len_from_disk(de->rec_len, - dir->i_sb->s_blocksize); + de_len = ext4_rec_len_from_disk(de->rec_len); if (de_len <= 0) return -1; offset += de_len; @@ -1021,7 +996,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q de = (struct ext4_dir_entry_2 *) bh->b_data; top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize - EXT4_DIR_REC_LEN(0)); - for (; de < top; de = ext4_next_entry(de, sb->s_blocksize)) { + for (; de < top; de = ext4_next_entry(de)) { int off = (block << EXT4_BLOCK_SIZE_BITS(sb)) + ((char *) de - bh->b_data); @@ -1077,16 +1052,8 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru return ERR_PTR(-EIO); } inode = ext4_iget(dir->i_sb, ino); - if (unlikely(IS_ERR(inode))) { - if (PTR_ERR(inode) == -ESTALE) { - ext4_error(dir->i_sb, __func__, - "deleted inode referenced: %u", - ino); - return ERR_PTR(-EIO); - } else { - return ERR_CAST(inode); - } - } + if (IS_ERR(inode)) + return ERR_CAST(inode); } return d_splice_alias(inode, dentry); } @@ -1142,8 +1109,7 @@ static inline void ext4_set_de_type(struct super_block *sb, * Returns pointer to last entry moved. */ static struct ext4_dir_entry_2 * -dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count, - unsigned blocksize) +dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count) { unsigned rec_len = 0; @@ -1152,7 +1118,7 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count, rec_len = EXT4_DIR_REC_LEN(de->name_len); memcpy (to, de, rec_len); ((struct ext4_dir_entry_2 *) to)->rec_len = - ext4_rec_len_to_disk(rec_len, blocksize); + ext4_rec_len_to_disk(rec_len); de->inode = 0; map++; to += rec_len; @@ -1164,19 +1130,19 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count, * Compact each dir entry in the range to the minimal rec_len. * Returns pointer to last entry in range. 
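The ext4_rec_len_from_disk()/ext4_rec_len_to_disk() pair removed earlier lets a 16-bit on-disk field describe rec_len values up to 256KiB: lengths are multiples of four, so bits 16-17 of a large length can hide in the two low bits. A standalone round-trip of that encoding (userspace sketch, byte order ignored):

#include <stdio.h>

#define MAX_REC_LEN ((1 << 16) - 1)	/* mirrors EXT4_MAX_REC_LEN */

/* Round-trip of the removed rec_len encoding: lengths are multiples of 4,
 * so bits 16-17 of a large length are folded into the two low bits. */
static unsigned short to_disk(unsigned len, unsigned blocksize)
{
	if (len < 65536)
		return (unsigned short)len;
	if (len == blocksize)
		return blocksize == 65536 ? MAX_REC_LEN : 0;
	return (unsigned short)((len & 65532) | ((len >> 16) & 3));
}

static unsigned from_disk(unsigned short dlen, unsigned blocksize)
{
	if (dlen == MAX_REC_LEN || dlen == 0)
		return blocksize;
	return (dlen & 65532) | ((dlen & 3) << 16);
}

int main(void)
{
	unsigned blocksize = 1 << 18;
	unsigned len = 200000;	/* a multiple of 4, as real rec_lens are */

	printf("%u\n", from_disk(to_disk(len, blocksize), blocksize)); /* 200000 */
	return 0;
}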
*/ -static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize) +static struct ext4_dir_entry_2* dx_pack_dirents(char *base, int size) { struct ext4_dir_entry_2 *next, *to, *prev, *de = (struct ext4_dir_entry_2 *) base; unsigned rec_len = 0; prev = to = de; - while ((char*)de < base + blocksize) { - next = ext4_next_entry(de, blocksize); + while ((char*)de < base + size) { + next = ext4_next_entry(de); if (de->inode && de->name_len) { rec_len = EXT4_DIR_REC_LEN(de->name_len); if (de > to) memmove(to, de, rec_len); - to->rec_len = ext4_rec_len_to_disk(rec_len, blocksize); + to->rec_len = ext4_rec_len_to_disk(rec_len); prev = to; to = (struct ext4_dir_entry_2 *) (((char *) to) + rec_len); } @@ -1249,12 +1215,10 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, hash2, split, count-split)); /* Fancy dance to stay within two buffers */ - de2 = dx_move_dirents(data1, data2, map + split, count - split, blocksize); + de2 = dx_move_dirents(data1, data2, map + split, count - split); de = dx_pack_dirents(data1, blocksize); - de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de, - blocksize); - de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2, - blocksize); + de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de); + de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2); dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1)); dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1)); @@ -1304,7 +1268,6 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, const char *name = dentry->d_name.name; int namelen = dentry->d_name.len; unsigned int offset = 0; - unsigned int blocksize = dir->i_sb->s_blocksize; unsigned short reclen; int nlen, rlen, err; char *top; @@ -1312,7 +1275,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, reclen = EXT4_DIR_REC_LEN(namelen); if (!de) { de = (struct ext4_dir_entry_2 *)bh->b_data; - top = bh->b_data + blocksize - reclen; + top = bh->b_data + dir->i_sb->s_blocksize - reclen; while ((char *) de <= top) { if (!ext4_check_dir_entry("ext4_add_entry", dir, de, bh, offset)) { @@ -1324,7 +1287,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, return -EEXIST; } nlen = EXT4_DIR_REC_LEN(de->name_len); - rlen = ext4_rec_len_from_disk(de->rec_len, blocksize); + rlen = ext4_rec_len_from_disk(de->rec_len); if ((de->inode? 
rlen - nlen: rlen) >= reclen) break; de = (struct ext4_dir_entry_2 *)((char *)de + rlen); @@ -1343,11 +1306,11 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, /* By now the buffer is marked for journaling */ nlen = EXT4_DIR_REC_LEN(de->name_len); - rlen = ext4_rec_len_from_disk(de->rec_len, blocksize); + rlen = ext4_rec_len_from_disk(de->rec_len); if (de->inode) { struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen); - de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, blocksize); - de->rec_len = ext4_rec_len_to_disk(nlen, blocksize); + de1->rec_len = ext4_rec_len_to_disk(rlen - nlen); + de->rec_len = ext4_rec_len_to_disk(nlen); de = de1; } de->file_type = EXT4_FT_UNKNOWN; @@ -1417,7 +1380,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, /* The 0th block becomes the root, move the dirents out */ fde = &root->dotdot; de = (struct ext4_dir_entry_2 *)((char *)fde + - ext4_rec_len_from_disk(fde->rec_len, blocksize)); + ext4_rec_len_from_disk(fde->rec_len)); if ((char *) de >= (((char *) root) + blocksize)) { ext4_error(dir->i_sb, __func__, "invalid rec_len for '..' in inode %lu", @@ -1439,14 +1402,12 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, memcpy (data1, de, len); de = (struct ext4_dir_entry_2 *) data1; top = data1 + len; - while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top) + while ((char *)(de2 = ext4_next_entry(de)) < top) de = de2; - de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de, - blocksize); + de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de); /* Initialize the root; the dot dirents already exist */ de = (struct ext4_dir_entry_2 *) (&root->dotdot); - de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2), - blocksize); + de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2)); memset (&root->info, 0, sizeof(root->info)); root->info.info_length = sizeof(root->info); root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; @@ -1527,7 +1488,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, return retval; de = (struct ext4_dir_entry_2 *) bh->b_data; de->inode = 0; - de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize); + de->rec_len = ext4_rec_len_to_disk(blocksize); return add_dirent_to_buf(handle, dentry, inode, de, bh); } @@ -1590,8 +1551,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, goto cleanup; node2 = (struct dx_node *)(bh2->b_data); entries2 = node2->entries; - node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize, - sb->s_blocksize); + node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize); node2->fake.inode = 0; BUFFER_TRACE(frame->bh, "get_write_access"); err = ext4_journal_get_write_access(handle, frame->bh); @@ -1679,7 +1639,6 @@ static int ext4_delete_entry(handle_t *handle, struct buffer_head *bh) { struct ext4_dir_entry_2 *de, *pde; - unsigned int blocksize = dir->i_sb->s_blocksize; int i; i = 0; @@ -1693,11 +1652,8 @@ static int ext4_delete_entry(handle_t *handle, ext4_journal_get_write_access(handle, bh); if (pde) pde->rec_len = ext4_rec_len_to_disk( - ext4_rec_len_from_disk(pde->rec_len, - blocksize) + - ext4_rec_len_from_disk(de->rec_len, - blocksize), - blocksize); + ext4_rec_len_from_disk(pde->rec_len) + + ext4_rec_len_from_disk(de->rec_len)); else de->inode = 0; dir->i_version++; @@ -1705,9 +1661,9 @@ static int ext4_delete_entry(handle_t *handle, ext4_handle_dirty_metadata(handle, dir, bh); return 0; } - i += 
ext4_rec_len_from_disk(de->rec_len, blocksize); + i += ext4_rec_len_from_disk(de->rec_len); pde = de; - de = ext4_next_entry(de, blocksize); + de = ext4_next_entry(de); } return -ENOENT; } @@ -1837,7 +1793,6 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode) struct inode *inode; struct buffer_head *dir_block; struct ext4_dir_entry_2 *de; - unsigned int blocksize = dir->i_sb->s_blocksize; int err, retries = 0; if (EXT4_DIR_LINK_MAX(dir)) @@ -1869,14 +1824,13 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode) de = (struct ext4_dir_entry_2 *) dir_block->b_data; de->inode = cpu_to_le32(inode->i_ino); de->name_len = 1; - de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len), - blocksize); + de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len)); strcpy(de->name, "."); ext4_set_de_type(dir->i_sb, de, S_IFDIR); - de = ext4_next_entry(de, blocksize); + de = ext4_next_entry(de); de->inode = cpu_to_le32(dir->i_ino); - de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(1), - blocksize); + de->rec_len = ext4_rec_len_to_disk(inode->i_sb->s_blocksize - + EXT4_DIR_REC_LEN(1)); de->name_len = 2; strcpy(de->name, ".."); ext4_set_de_type(dir->i_sb, de, S_IFDIR); @@ -1931,7 +1885,7 @@ static int empty_dir(struct inode *inode) return 1; } de = (struct ext4_dir_entry_2 *) bh->b_data; - de1 = ext4_next_entry(de, sb->s_blocksize); + de1 = ext4_next_entry(de); if (le32_to_cpu(de->inode) != inode->i_ino || !le32_to_cpu(de1->inode) || strcmp(".", de->name) || @@ -1942,9 +1896,9 @@ static int empty_dir(struct inode *inode) brelse(bh); return 1; } - offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) + - ext4_rec_len_from_disk(de1->rec_len, sb->s_blocksize); - de = ext4_next_entry(de1, sb->s_blocksize); + offset = ext4_rec_len_from_disk(de->rec_len) + + ext4_rec_len_from_disk(de1->rec_len); + de = ext4_next_entry(de1); while (offset < inode->i_size) { if (!bh || (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { @@ -1973,8 +1927,8 @@ static int empty_dir(struct inode *inode) brelse(bh); return 0; } - offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); - de = ext4_next_entry(de, sb->s_blocksize); + offset += ext4_rec_len_from_disk(de->rec_len); + de = ext4_next_entry(de); } brelse(bh); return 1; @@ -2343,8 +2297,8 @@ static int ext4_link(struct dentry *old_dentry, return err; } -#define PARENT_INO(buffer, size) \ - (ext4_next_entry((struct ext4_dir_entry_2 *)(buffer), size)->inode) +#define PARENT_INO(buffer) \ + (ext4_next_entry((struct ext4_dir_entry_2 *)(buffer))->inode) /* * Anybody can rename anything with this: the permission checks are left to the @@ -2357,7 +2311,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *old_inode, *new_inode; struct buffer_head *old_bh, *new_bh, *dir_bh; struct ext4_dir_entry_2 *old_de, *new_de; - int retval, force_da_alloc = 0; + int retval; old_bh = new_bh = dir_bh = NULL; @@ -2404,8 +2358,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval); if (!dir_bh) goto end_rename; - if (le32_to_cpu(PARENT_INO(dir_bh->b_data, - old_dir->i_sb->s_blocksize)) != old_dir->i_ino) + if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino) goto end_rename; retval = -EMLINK; if (!new_inode && new_dir != old_dir && @@ -2477,8 +2430,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, if (dir_bh) { BUFFER_TRACE(dir_bh, "get_write_access"); 
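The simplified PARENT_INO() relies on the fixed layout of a directory's first block: entry 0 is "." and hopping over its rec_len lands on "..", whose inode field names the parent. A userspace illustration that builds such a block by hand (a flat layout standing in for the real on-disk format, endianness ignored):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Simplified view of an ext4 directory block: entry 0 is ".", and the
 * ".." entry (holding the parent inode) starts rec_len bytes later --
 * exactly the hop the PARENT_INO() macro takes via ext4_next_entry(). */
static uint32_t parent_ino(const unsigned char *block)
{
	uint16_t rec_len;
	uint32_t ino;

	memcpy(&rec_len, block + 4, sizeof(rec_len));	/* "."'s rec_len */
	memcpy(&ino, block + rec_len, sizeof(ino));	/* ".."'s inode */
	return ino;
}

int main(void)
{
	unsigned char block[64] = {0};
	uint32_t self = 12, parent = 2;
	uint16_t dot_rec_len = 12;

	memcpy(block, &self, 4);		/* "." entry: inode */
	memcpy(block + 4, &dot_rec_len, 2);	/* "." entry: rec_len */
	memcpy(block + 12, &parent, 4);		/* ".." entry: inode */
	printf("parent=%u\n", parent_ino(block)); /* parent=2 */
	return 0;
}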
ext4_journal_get_write_access(handle, dir_bh); - PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) = - cpu_to_le32(new_dir->i_ino); + PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino); BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); ext4_handle_dirty_metadata(handle, old_dir, dir_bh); ext4_dec_count(handle, old_dir); @@ -2497,8 +2449,6 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, ext4_mark_inode_dirty(handle, new_inode); if (!new_inode->i_nlink) ext4_orphan_add(handle, new_inode); - if (!test_opt(new_dir->i_sb, NO_AUTO_DA_ALLOC)) - force_da_alloc = 1; } retval = 0; @@ -2507,8 +2457,6 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, brelse(old_bh); brelse(new_bh); ext4_journal_stop(handle); - if (retval == 0 && force_da_alloc) - ext4_alloc_da_blocks(old_inode); return retval; } diff --git a/trunk/fs/ext4/resize.c b/trunk/fs/ext4/resize.c index 546c7dd869e1..c06886abd658 100644 --- a/trunk/fs/ext4/resize.c +++ b/trunk/fs/ext4/resize.c @@ -938,10 +938,10 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) { ext4_group_t flex_group; flex_group = ext4_flex_group(sbi, input->group); - atomic_add(input->free_blocks_count, - &sbi->s_flex_groups[flex_group].free_blocks); - atomic_add(EXT4_INODES_PER_GROUP(sb), - &sbi->s_flex_groups[flex_group].free_inodes); + sbi->s_flex_groups[flex_group].free_blocks += + input->free_blocks_count; + sbi->s_flex_groups[flex_group].free_inodes += + EXT4_INODES_PER_GROUP(sb); } ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); diff --git a/trunk/fs/ext4/super.c b/trunk/fs/ext4/super.c index 9987bba99db3..f7371a6a923d 100644 --- a/trunk/fs/ext4/super.c +++ b/trunk/fs/ext4/super.c @@ -35,7 +35,6 @@ #include #include #include -#include #include #include #include @@ -49,7 +48,6 @@ #include "group.h" struct proc_dir_entry *ext4_proc_root; -static struct kset *ext4_kset; static int ext4_load_journal(struct super_block *, struct ext4_super_block *, unsigned long journal_devnum); @@ -579,9 +577,9 @@ static void ext4_put_super(struct super_block *sb) ext4_commit_super(sb, es, 1); } if (sbi->s_proc) { + remove_proc_entry("inode_readahead_blks", sbi->s_proc); remove_proc_entry(sb->s_id, ext4_proc_root); } - kobject_del(&sbi->s_kobj); for (i = 0; i < sbi->s_gdb_count; i++) brelse(sbi->s_group_desc[i]); @@ -617,17 +615,6 @@ static void ext4_put_super(struct super_block *sb) ext4_blkdev_remove(sbi); } sb->s_fs_info = NULL; - /* - * Now that we are completely done shutting down the - * superblock, we need to actually destroy the kobject. 
- */ - unlock_kernel(); - unlock_super(sb); - kobject_put(&sbi->s_kobj); - wait_for_completion(&sbi->s_kobj_unregister); - lock_super(sb); - lock_kernel(); - kfree(sbi->s_blockgroup_lock); kfree(sbi); return; } @@ -816,6 +803,8 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL)) seq_puts(seq, ",noacl"); #endif + if (!test_opt(sb, RESERVATION)) + seq_puts(seq, ",noreservation"); if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) { seq_printf(seq, ",commit=%u", (unsigned) (sbi->s_commit_interval / HZ)); @@ -866,9 +855,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) if (test_opt(sb, DATA_ERR_ABORT)) seq_puts(seq, ",data_err=abort"); - if (test_opt(sb, NO_AUTO_DA_ALLOC)) - seq_puts(seq, ",noauto_da_alloc"); - ext4_show_quota_options(seq, sb); return 0; } @@ -1018,7 +1004,7 @@ enum { Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov, Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, - Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload, Opt_nobh, Opt_bh, + Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, Opt_commit, Opt_min_batch_time, Opt_max_batch_time, Opt_journal_update, Opt_journal_dev, Opt_journal_checksum, Opt_journal_async_commit, @@ -1026,8 +1012,8 @@ enum { Opt_data_err_abort, Opt_data_err_ignore, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, - Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize, - Opt_usrquota, Opt_grpquota, Opt_i_version, + Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, + Opt_grpquota, Opt_i_version, Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_inode_readahead_blks, Opt_journal_ioprio }; @@ -1053,6 +1039,8 @@ static const match_table_t tokens = { {Opt_nouser_xattr, "nouser_xattr"}, {Opt_acl, "acl"}, {Opt_noacl, "noacl"}, + {Opt_reservation, "reservation"}, + {Opt_noreservation, "noreservation"}, {Opt_noload, "noload"}, {Opt_nobh, "nobh"}, {Opt_bh, "bh"}, @@ -1080,8 +1068,6 @@ static const match_table_t tokens = { {Opt_quota, "quota"}, {Opt_usrquota, "usrquota"}, {Opt_barrier, "barrier=%u"}, - {Opt_barrier, "barrier"}, - {Opt_nobarrier, "nobarrier"}, {Opt_i_version, "i_version"}, {Opt_stripe, "stripe=%u"}, {Opt_resize, "resize"}, @@ -1089,9 +1075,6 @@ static const match_table_t tokens = { {Opt_nodelalloc, "nodelalloc"}, {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, {Opt_journal_ioprio, "journal_ioprio=%u"}, - {Opt_auto_da_alloc, "auto_da_alloc=%u"}, - {Opt_auto_da_alloc, "auto_da_alloc"}, - {Opt_noauto_da_alloc, "noauto_da_alloc"}, {Opt_err, NULL}, }; @@ -1224,6 +1207,12 @@ static int parse_options(char *options, struct super_block *sb, "not supported\n"); break; #endif + case Opt_reservation: + set_opt(sbi->s_mount_opt, RESERVATION); + break; + case Opt_noreservation: + clear_opt(sbi->s_mount_opt, RESERVATION); + break; case Opt_journal_update: /* @@@ FIXME */ /* Eventually we will want to be able to create @@ -1426,14 +1415,9 @@ static int parse_options(char *options, struct super_block *sb, case Opt_abort: set_opt(sbi->s_mount_opt, ABORT); break; - case Opt_nobarrier: - clear_opt(sbi->s_mount_opt, BARRIER); - break; case Opt_barrier: - if (match_int(&args[0], &option)) { - set_opt(sbi->s_mount_opt, BARRIER); - break; - } + if (match_int(&args[0], &option)) + return 0; if (option) set_opt(sbi->s_mount_opt, BARRIER); else @@ -1479,11 
+1463,6 @@ static int parse_options(char *options, struct super_block *sb, return 0; if (option < 0 || option > (1 << 30)) return 0; - if (option & (option - 1)) { - printk(KERN_ERR "EXT4-fs: inode_readahead_blks" - " must be a power of 2\n"); - return 0; - } sbi->s_inode_readahead_blks = option; break; case Opt_journal_ioprio: @@ -1494,19 +1473,6 @@ static int parse_options(char *options, struct super_block *sb, *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, option); break; - case Opt_noauto_da_alloc: - set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC); - break; - case Opt_auto_da_alloc: - if (match_int(&args[0], &option)) { - clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC); - break; - } - if (option) - clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC); - else - set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC); - break; default: printk(KERN_ERR "EXT4-fs: Unrecognized mount option \"%s\" " @@ -1646,12 +1612,10 @@ static int ext4_fill_flex_info(struct super_block *sb) gdp = ext4_get_group_desc(sb, i, &bh); flex_group = ext4_flex_group(sbi, i); - atomic_set(&sbi->s_flex_groups[flex_group].free_inodes, - ext4_free_inodes_count(sb, gdp)); - atomic_set(&sbi->s_flex_groups[flex_group].free_blocks, - ext4_free_blks_count(sb, gdp)); - atomic_set(&sbi->s_flex_groups[flex_group].used_dirs, - ext4_used_dirs_count(sb, gdp)); + sbi->s_flex_groups[flex_group].free_inodes += + ext4_free_inodes_count(sb, gdp); + sbi->s_flex_groups[flex_group].free_blocks += + ext4_free_blks_count(sb, gdp); } return 1; @@ -2027,181 +1991,6 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi) return 0; } -/* sysfs supprt */ - -struct ext4_attr { - struct attribute attr; - ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *); - ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *, - const char *, size_t); - int offset; -}; - -static int parse_strtoul(const char *buf, - unsigned long max, unsigned long *value) -{ - char *endp; - - while (*buf && isspace(*buf)) - buf++; - *value = simple_strtoul(buf, &endp, 0); - while (*endp && isspace(*endp)) - endp++; - if (*endp || *value > max) - return -EINVAL; - - return 0; -} - -static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a, - struct ext4_sb_info *sbi, - char *buf) -{ - return snprintf(buf, PAGE_SIZE, "%llu\n", - (s64) percpu_counter_sum(&sbi->s_dirtyblocks_counter)); -} - -static ssize_t session_write_kbytes_show(struct ext4_attr *a, - struct ext4_sb_info *sbi, char *buf) -{ - struct super_block *sb = sbi->s_buddy_cache->i_sb; - - return snprintf(buf, PAGE_SIZE, "%lu\n", - (part_stat_read(sb->s_bdev->bd_part, sectors[1]) - - sbi->s_sectors_written_start) >> 1); -} - -static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a, - struct ext4_sb_info *sbi, char *buf) -{ - struct super_block *sb = sbi->s_buddy_cache->i_sb; - - return snprintf(buf, PAGE_SIZE, "%llu\n", - sbi->s_kbytes_written + - ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - - EXT4_SB(sb)->s_sectors_written_start) >> 1)); -} - -static ssize_t inode_readahead_blks_store(struct ext4_attr *a, - struct ext4_sb_info *sbi, - const char *buf, size_t count) -{ - unsigned long t; - - if (parse_strtoul(buf, 0x40000000, &t)) - return -EINVAL; - - /* inode_readahead_blks must be a power of 2 */ - if (t & (t-1)) - return -EINVAL; - - sbi->s_inode_readahead_blks = t; - return count; -} - -static ssize_t sbi_ui_show(struct ext4_attr *a, - struct ext4_sb_info *sbi, char *buf) -{ - unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset); - - return snprintf(buf, PAGE_SIZE, 
"%u\n", *ui); -} - -static ssize_t sbi_ui_store(struct ext4_attr *a, - struct ext4_sb_info *sbi, - const char *buf, size_t count) -{ - unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset); - unsigned long t; - - if (parse_strtoul(buf, 0xffffffff, &t)) - return -EINVAL; - *ui = t; - return count; -} - -#define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \ -static struct ext4_attr ext4_attr_##_name = { \ - .attr = {.name = __stringify(_name), .mode = _mode }, \ - .show = _show, \ - .store = _store, \ - .offset = offsetof(struct ext4_sb_info, _elname), \ -} -#define EXT4_ATTR(name, mode, show, store) \ -static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store) - -#define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL) -#define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store) -#define EXT4_RW_ATTR_SBI_UI(name, elname) \ - EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname) -#define ATTR_LIST(name) &ext4_attr_##name.attr - -EXT4_RO_ATTR(delayed_allocation_blocks); -EXT4_RO_ATTR(session_write_kbytes); -EXT4_RO_ATTR(lifetime_write_kbytes); -EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show, - inode_readahead_blks_store, s_inode_readahead_blks); -EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats); -EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan); -EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan); -EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); -EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); -EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); - -static struct attribute *ext4_attrs[] = { - ATTR_LIST(delayed_allocation_blocks), - ATTR_LIST(session_write_kbytes), - ATTR_LIST(lifetime_write_kbytes), - ATTR_LIST(inode_readahead_blks), - ATTR_LIST(mb_stats), - ATTR_LIST(mb_max_to_scan), - ATTR_LIST(mb_min_to_scan), - ATTR_LIST(mb_order2_req), - ATTR_LIST(mb_stream_req), - ATTR_LIST(mb_group_prealloc), - NULL, -}; - -static ssize_t ext4_attr_show(struct kobject *kobj, - struct attribute *attr, char *buf) -{ - struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, - s_kobj); - struct ext4_attr *a = container_of(attr, struct ext4_attr, attr); - - return a->show ? a->show(a, sbi, buf) : 0; -} - -static ssize_t ext4_attr_store(struct kobject *kobj, - struct attribute *attr, - const char *buf, size_t len) -{ - struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, - s_kobj); - struct ext4_attr *a = container_of(attr, struct ext4_attr, attr); - - return a->store ? 
a->store(a, sbi, buf, len) : 0; -} - -static void ext4_sb_release(struct kobject *kobj) -{ - struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, - s_kobj); - complete(&sbi->s_kobj_unregister); -} - - -static struct sysfs_ops ext4_attr_ops = { - .show = ext4_attr_show, - .store = ext4_attr_store, -}; - -static struct kobj_type ext4_ktype = { - .default_attrs = ext4_attrs, - .sysfs_ops = &ext4_attr_ops, - .release = ext4_sb_release, -}; - static int ext4_fill_super(struct super_block *sb, void *data, int silent) __releases(kernel_lock) __acquires(kernel_lock) @@ -2232,21 +2021,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) return -ENOMEM; - - sbi->s_blockgroup_lock = - kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); - if (!sbi->s_blockgroup_lock) { - kfree(sbi); - return -ENOMEM; - } sb->s_fs_info = sbi; sbi->s_mount_opt = 0; sbi->s_resuid = EXT4_DEF_RESUID; sbi->s_resgid = EXT4_DEF_RESGID; sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; sbi->s_sb_block = sb_block; - sbi->s_sectors_written_start = part_stat_read(sb->s_bdev->bd_part, - sectors[1]); unlock_kernel(); @@ -2284,7 +2064,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sb->s_magic = le16_to_cpu(es->s_magic); if (sb->s_magic != EXT4_SUPER_MAGIC) goto cantfind_ext4; - sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written); /* Set defaults before we parse the mount options */ def_mount_opts = le32_to_cpu(es->s_default_mount_opts); @@ -2322,6 +2101,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; + set_opt(sbi->s_mount_opt, RESERVATION); set_opt(sbi->s_mount_opt, BARRIER); /* @@ -2545,9 +2325,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) #ifdef CONFIG_PROC_FS if (ext4_proc_root) sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root); + + if (sbi->s_proc) + proc_create_data("inode_readahead_blks", 0644, sbi->s_proc, + &ext4_ui_proc_fops, + &sbi->s_inode_readahead_blks); #endif - bgl_lock_init(sbi->s_blockgroup_lock); + bgl_lock_init(&sbi->s_blockgroup_lock); for (i = 0; i < db_count; i++) { block = descriptor_loc(sb, logical_sb_block, i); @@ -2779,16 +2564,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount4; } - sbi->s_kobj.kset = ext4_kset; - init_completion(&sbi->s_kobj_unregister); - err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL, - "%s", sb->s_id); - if (err) { - ext4_mb_release(sb); - ext4_ext_release(sb); - goto failed_mount4; - }; - /* * akpm: core read_super() calls in here with the superblock locked. 
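The inode_readahead_blks proc file registered above is backed by ext4_ui_proc_fops, whose write path (shown further down) boils down to: bound the input, copy it in, and parse it with strtoul in base 0. A userspace model of that flow; unlike the kernel version, this sketch NUL-terminates explicitly:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Model of the ext4_ui_proc_write() flow: reject writes that don't fit
 * the scratch buffer, terminate the string, then parse with strtoul
 * (base 0, so "32", "0x20" and "040" all work). */
static int ui_proc_write(unsigned long *value, const char *buf, size_t cnt)
{
	char str[32];

	if (cnt >= sizeof(str))
		return -1;		/* -EINVAL in the kernel version */
	memcpy(str, buf, cnt);
	str[cnt] = '\0';
	*value = strtoul(str, NULL, 0);
	return (int)cnt;
}

int main(void)
{
	unsigned long v = 0;

	ui_proc_write(&v, "0x20", 4);
	printf("%lu\n", v); /* 32 */
	return 0;
}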
* That deadlocks, because orphan cleanup needs to lock the superblock @@ -2843,6 +2618,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) kfree(sbi->s_group_desc); failed_mount: if (sbi->s_proc) { + remove_proc_entry("inode_readahead_blks", sbi->s_proc); remove_proc_entry(sb->s_id, ext4_proc_root); } #ifdef CONFIG_QUOTA @@ -3137,10 +2913,6 @@ static int ext4_commit_super(struct super_block *sb, set_buffer_uptodate(sbh); } es->s_wtime = cpu_to_le32(get_seconds()); - es->s_kbytes_written = - cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + - ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - - EXT4_SB(sb)->s_sectors_written_start) >> 1)); ext4_free_blocks_count_set(es, percpu_counter_sum_positive( &EXT4_SB(sb)->s_freeblocks_counter)); es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive( @@ -3875,6 +3647,45 @@ static int ext4_get_sb(struct file_system_type *fs_type, return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt); } +#ifdef CONFIG_PROC_FS +static int ext4_ui_proc_show(struct seq_file *m, void *v) +{ + unsigned int *p = m->private; + + seq_printf(m, "%u\n", *p); + return 0; +} + +static int ext4_ui_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, ext4_ui_proc_show, PDE(inode)->data); +} + +static ssize_t ext4_ui_proc_write(struct file *file, const char __user *buf, + size_t cnt, loff_t *ppos) +{ + unsigned long *p = PDE(file->f_path.dentry->d_inode)->data; + char str[32]; + + if (cnt >= sizeof(str)) + return -EINVAL; + if (copy_from_user(str, buf, cnt)) + return -EFAULT; + + *p = simple_strtoul(str, NULL, 0); + return cnt; +} + +const struct file_operations ext4_ui_proc_fops = { + .owner = THIS_MODULE, + .open = ext4_ui_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = ext4_ui_proc_write, +}; +#endif + static struct file_system_type ext4_fs_type = { .owner = THIS_MODULE, .name = "ext4", @@ -3908,9 +3719,6 @@ static int __init init_ext4_fs(void) { int err; - ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj); - if (!ext4_kset) - return -ENOMEM; ext4_proc_root = proc_mkdir("fs/ext4", NULL); err = init_ext4_mballoc(); if (err) @@ -3952,7 +3760,6 @@ static void __exit exit_ext4_fs(void) exit_ext4_xattr(); exit_ext4_mballoc(); remove_proc_entry("fs/ext4", NULL); - kset_unregister(ext4_kset); } MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); diff --git a/trunk/fs/jbd2/commit.c b/trunk/fs/jbd2/commit.c index 4ea72377c7a2..62804e57a44c 100644 --- a/trunk/fs/jbd2/commit.c +++ b/trunk/fs/jbd2/commit.c @@ -367,7 +367,6 @@ void jbd2_journal_commit_transaction(journal_t *journal) int tag_bytes = journal_tag_bytes(journal); struct buffer_head *cbh = NULL; /* For transactional checksums */ __u32 crc32_sum = ~0; - int write_op = WRITE; /* * First job: lock down the current transaction and wait for @@ -402,8 +401,6 @@ void jbd2_journal_commit_transaction(journal_t *journal) spin_lock(&journal->j_state_lock); commit_transaction->t_state = T_LOCKED; - if (commit_transaction->t_synchronous_commit) - write_op = WRITE_SYNC; stats.u.run.rs_wait = commit_transaction->t_max_wait; stats.u.run.rs_locked = jiffies; stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start, @@ -683,7 +680,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) clear_buffer_dirty(bh); set_buffer_uptodate(bh); bh->b_end_io = journal_end_buffer_io_sync; - submit_bh(write_op, bh); + submit_bh(WRITE, bh); } cond_resched(); 
stats.u.run.rs_blocks_logged += bufs; diff --git a/trunk/fs/jbd2/revoke.c b/trunk/fs/jbd2/revoke.c index bbe6d592d8b3..257ff2625765 100644 --- a/trunk/fs/jbd2/revoke.c +++ b/trunk/fs/jbd2/revoke.c @@ -55,25 +55,6 @@ * need do nothing. * RevokeValid set, Revoked set: * buffer has been revoked. - * - * Locking rules: - * We keep two hash tables of revoke records. One hashtable belongs to the - * running transaction (is pointed to by journal->j_revoke), the other one - * belongs to the committing transaction. Accesses to the second hash table - * happen only from the kjournald and no other thread touches this table. Also - * journal_switch_revoke_table() which switches which hashtable belongs to the - * running and which to the committing transaction is called only from - * kjournald. Therefore we need no locks when accessing the hashtable belonging - * to the committing transaction. - * - * All users operating on the hash table belonging to the running transaction - * have a handle to the transaction. Therefore they are safe from kjournald - * switching hash tables under them. For operations on the lists of entries in - * the hash table j_revoke_lock is used. - * - * Finally, also replay code uses the hash tables but at this moment noone else - * can touch them (filesystem isn't mounted yet) and hence no locking is - * needed. */ #ifndef __KERNEL__ @@ -420,6 +401,8 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr, * the second time we would still have a pending revoke to cancel. So, * do not trust the Revoked bit on buffers unless RevokeValid is also * set. + * + * The caller must have the journal locked. */ int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh) { @@ -497,7 +480,10 @@ void jbd2_journal_switch_revoke_table(journal_t *journal) /* * Write revoke records to the journal for all entries in the current * revoke hash, deleting the entries as we go. + * + * Called with the journal lock held. */ + void jbd2_journal_write_revoke_records(journal_t *journal, transaction_t *transaction) { diff --git a/trunk/fs/jbd2/transaction.c b/trunk/fs/jbd2/transaction.c index 996ffda06bf3..28ce21d8598e 100644 --- a/trunk/fs/jbd2/transaction.c +++ b/trunk/fs/jbd2/transaction.c @@ -1315,8 +1315,6 @@ int jbd2_journal_stop(handle_t *handle) } } - if (handle->h_sync) - transaction->t_synchronous_commit = 1; current->journal_info = NULL; spin_lock(&journal->j_state_lock); spin_lock(&transaction->t_handle_lock); diff --git a/trunk/fs/lockd/clntlock.c b/trunk/fs/lockd/clntlock.c index 1f3b0fc0d351..aedc47a264c1 100644 --- a/trunk/fs/lockd/clntlock.c +++ b/trunk/fs/lockd/clntlock.c @@ -139,6 +139,55 @@ int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout) return 0; } +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +static const struct in6_addr *nlmclnt_map_v4addr(const struct sockaddr *sap, + struct in6_addr *addr_mapped) +{ + const struct sockaddr_in *sin = (const struct sockaddr_in *)sap; + + switch (sap->sa_family) { + case AF_INET6: + return &((const struct sockaddr_in6 *)sap)->sin6_addr; + case AF_INET: + ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, addr_mapped); + return addr_mapped; + } + + return NULL; +} + +/* + * If lockd is using a PF_INET6 listener, all incoming requests appear + * to come from AF_INET6 remotes. The address of AF_INET remotes are + * mapped to AF_INET6 automatically by the network layer. 
In case the + * user passed an AF_INET server address at mount time, ensure both + * addresses are AF_INET6 before comparing them. + */ +static int nlmclnt_cmp_addr(const struct nlm_host *host, + const struct sockaddr *sap) +{ + const struct in6_addr *addr1; + const struct in6_addr *addr2; + struct in6_addr addr1_mapped; + struct in6_addr addr2_mapped; + + addr1 = nlmclnt_map_v4addr(nlm_addr(host), &addr1_mapped); + if (likely(addr1 != NULL)) { + addr2 = nlmclnt_map_v4addr(sap, &addr2_mapped); + if (likely(addr2 != NULL)) + return ipv6_addr_equal(addr1, addr2); + } + + return 0; +} +#else /* !(CONFIG_IPV6 || CONFIG_IPV6_MODULE) */ +static int nlmclnt_cmp_addr(const struct nlm_host *host, + const struct sockaddr *sap) +{ + return nlm_cmp_addr(nlm_addr(host), sap); +} +#endif /* !(CONFIG_IPV6 || CONFIG_IPV6_MODULE) */ + /* * The server lockd has called us back to tell us the lock was granted */ @@ -166,7 +215,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock) */ if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid) continue; - if (!nlm_cmp_addr(nlm_addr(block->b_host), addr)) + if (!nlmclnt_cmp_addr(block->b_host, addr)) continue; if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0) continue; diff --git a/trunk/fs/lockd/mon.c b/trunk/fs/lockd/mon.c index 6d5d4a4169e5..5e2c4d5ac827 100644 --- a/trunk/fs/lockd/mon.c +++ b/trunk/fs/lockd/mon.c @@ -16,8 +16,6 @@ #include #include -#include - #define NLMDBG_FACILITY NLMDBG_MONITOR #define NSM_PROGRAM 100024 #define NSM_VERSION 1 @@ -276,12 +274,10 @@ static void nsm_init_private(struct nsm_handle *nsm) { u64 *p = (u64 *)&nsm->sm_priv.data; struct timespec ts; - s64 ns; ktime_get_ts(&ts); - ns = timespec_to_ns(&ts); - put_unaligned(ns, p); - put_unaligned((unsigned long)nsm, p + 1); + *p++ = timespec_to_ns(&ts); + *p = (unsigned long)nsm; } static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap, diff --git a/trunk/fs/lockd/svc.c b/trunk/fs/lockd/svc.c index abf83881f68a..64f1c31b5853 100644 --- a/trunk/fs/lockd/svc.c +++ b/trunk/fs/lockd/svc.c @@ -52,6 +52,17 @@ static struct task_struct *nlmsvc_task; static struct svc_rqst *nlmsvc_rqst; unsigned long nlmsvc_timeout; +/* + * If the kernel has IPv6 support available, always listen for + * both AF_INET and AF_INET6 requests. + */ +#if (defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)) && \ + defined(CONFIG_SUNRPC_REGISTER_V4) +static const sa_family_t nlmsvc_family = AF_INET6; +#else /* (CONFIG_IPV6 || CONFIG_IPV6_MODULE) && CONFIG_SUNRPC_REGISTER_V4 */ +static const sa_family_t nlmsvc_family = AF_INET; +#endif /* (CONFIG_IPV6 || CONFIG_IPV6_MODULE) && CONFIG_SUNRPC_REGISTER_V4 */ + /* * These can be set at insmod time (useful for NFS as root filesystem), * and also changed through the sysctl interface. 
-- Jamie Lokier, Aug 2003 @@ -193,30 +204,19 @@ lockd(void *vrqstp) return 0; } -static int create_lockd_listener(struct svc_serv *serv, const char *name, - const int family, const unsigned short port) +static int create_lockd_listener(struct svc_serv *serv, char *name, + unsigned short port) { struct svc_xprt *xprt; - xprt = svc_find_xprt(serv, name, family, 0); + xprt = svc_find_xprt(serv, name, 0, 0); if (xprt == NULL) - return svc_create_xprt(serv, name, family, port, - SVC_SOCK_DEFAULTS); + return svc_create_xprt(serv, name, port, SVC_SOCK_DEFAULTS); + svc_xprt_put(xprt); return 0; } -static int create_lockd_family(struct svc_serv *serv, const int family) -{ - int err; - - err = create_lockd_listener(serv, "udp", family, nlm_udpport); - if (err < 0) - return err; - - return create_lockd_listener(serv, "tcp", family, nlm_tcpport); -} - /* * Ensure there are active UDP and TCP listeners for lockd. * @@ -232,15 +232,13 @@ static int make_socks(struct svc_serv *serv) static int warned; int err; - err = create_lockd_family(serv, PF_INET); + err = create_lockd_listener(serv, "udp", nlm_udpport); if (err < 0) goto out_err; -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - err = create_lockd_family(serv, PF_INET6); - if (err < 0 && err != -EAFNOSUPPORT) + err = create_lockd_listener(serv, "tcp", nlm_tcpport); + if (err < 0) goto out_err; -#endif /* CONFIG_IPV6 || CONFIG_IPV6_MODULE */ warned = 0; return 0; @@ -276,7 +274,7 @@ int lockd_up(void) "lockd_up: no pid, %d users??\n", nlmsvc_users); error = -ENOMEM; - serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, NULL); + serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, nlmsvc_family, NULL); if (!serv) { printk(KERN_WARNING "lockd_up: create service failed\n"); goto out; diff --git a/trunk/fs/nfs/callback.c b/trunk/fs/nfs/callback.c index a886e692ddd0..3e634f2a1083 100644 --- a/trunk/fs/nfs/callback.c +++ b/trunk/fs/nfs/callback.c @@ -38,10 +38,19 @@ static struct svc_program nfs4_callback_program; unsigned int nfs_callback_set_tcpport; unsigned short nfs_callback_tcpport; -unsigned short nfs_callback_tcpport6; static const int nfs_set_port_min = 0; static const int nfs_set_port_max = 65535; +/* + * If the kernel has IPv6 support available, always listen for + * both AF_INET and AF_INET6 requests. 
+ */ +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +static const sa_family_t nfs_callback_family = AF_INET6; +#else +static const sa_family_t nfs_callback_family = AF_INET; +#endif + static int param_set_port(const char *val, struct kernel_param *kp) { char *endp; @@ -107,29 +116,19 @@ int nfs_callback_up(void) mutex_lock(&nfs_callback_mutex); if (nfs_callback_info.users++ || nfs_callback_info.task != NULL) goto out; - serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL); + serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, + nfs_callback_family, NULL); ret = -ENOMEM; if (!serv) goto out_err; - ret = svc_create_xprt(serv, "tcp", PF_INET, - nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); + ret = svc_create_xprt(serv, "tcp", nfs_callback_set_tcpport, + SVC_SOCK_ANONYMOUS); if (ret <= 0) goto out_err; nfs_callback_tcpport = ret; dprintk("NFS: Callback listener port = %u (af %u)\n", - nfs_callback_tcpport, PF_INET); - -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - ret = svc_create_xprt(serv, "tcp", PF_INET6, - nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); - if (ret > 0) { - nfs_callback_tcpport6 = ret; - dprintk("NFS: Callback listener port = %u (af %u)\n", - nfs_callback_tcpport6, PF_INET6); - } else if (ret != -EAFNOSUPPORT) - goto out_err; -#endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */ + nfs_callback_tcpport, nfs_callback_family); nfs_callback_info.rqst = svc_prepare_thread(serv, &serv->sv_pools[0]); if (IS_ERR(nfs_callback_info.rqst)) { diff --git a/trunk/fs/nfs/callback.h b/trunk/fs/nfs/callback.h index e110e286a262..bb25d2135ff1 100644 --- a/trunk/fs/nfs/callback.h +++ b/trunk/fs/nfs/callback.h @@ -72,6 +72,5 @@ extern void nfs_callback_down(void); extern unsigned int nfs_callback_set_tcpport; extern unsigned short nfs_callback_tcpport; -extern unsigned short nfs_callback_tcpport6; #endif /* __LINUX_FS_NFS_CALLBACK_H */ diff --git a/trunk/fs/nfs/client.c b/trunk/fs/nfs/client.c index aba38017bdef..2277421656e7 100644 --- a/trunk/fs/nfs/client.c +++ b/trunk/fs/nfs/client.c @@ -224,6 +224,38 @@ void nfs_put_client(struct nfs_client *clp) } #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +static const struct in6_addr *nfs_map_ipv4_addr(const struct sockaddr *sa, struct in6_addr *addr_mapped) +{ + switch (sa->sa_family) { + default: + return NULL; + case AF_INET6: + return &((const struct sockaddr_in6 *)sa)->sin6_addr; + break; + case AF_INET: + ipv6_addr_set_v4mapped(((const struct sockaddr_in *)sa)->sin_addr.s_addr, + addr_mapped); + return addr_mapped; + } +} + +static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1, + const struct sockaddr *sa2) +{ + const struct in6_addr *addr1; + const struct in6_addr *addr2; + struct in6_addr addr1_mapped; + struct in6_addr addr2_mapped; + + addr1 = nfs_map_ipv4_addr(sa1, &addr1_mapped); + if (likely(addr1 != NULL)) { + addr2 = nfs_map_ipv4_addr(sa2, &addr2_mapped); + if (likely(addr2 != NULL)) + return ipv6_addr_equal(addr1, addr2); + } + return 0; +} + /* * Test if two ip6 socket addresses refer to the same socket by * comparing relevant fields. The padding bytes specifically, are not @@ -235,21 +267,38 @@ void nfs_put_client(struct nfs_client *clp) * * The caller should ensure both socket addresses are AF_INET6. 
 */
-static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
-				      const struct sockaddr *sa2)
+static int nfs_sockaddr_cmp_ip6(const struct sockaddr *sa1,
+				const struct sockaddr *sa2)
 {
-	const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1;
-	const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2;
+	const struct sockaddr_in6 *saddr1 = (const struct sockaddr_in6 *)sa1;
+	const struct sockaddr_in6 *saddr2 = (const struct sockaddr_in6 *)sa2;
 
-	if (ipv6_addr_scope(&sin1->sin6_addr) == IPV6_ADDR_SCOPE_LINKLOCAL &&
-	    sin1->sin6_scope_id != sin2->sin6_scope_id)
+	if (!ipv6_addr_equal(&saddr1->sin6_addr,
+			     &saddr2->sin6_addr))
 		return 0;
+	if (ipv6_addr_scope(&saddr1->sin6_addr) == IPV6_ADDR_SCOPE_LINKLOCAL &&
+	    saddr1->sin6_scope_id != saddr2->sin6_scope_id)
+		return 0;
+	return saddr1->sin6_port == saddr2->sin6_port;
+}
+#else
+static int nfs_sockaddr_match_ipaddr4(const struct sockaddr_in *sa1,
+				      const struct sockaddr_in *sa2)
+{
+	return sa1->sin_addr.s_addr == sa2->sin_addr.s_addr;
+}
 
-	return ipv6_addr_equal(&sin1->sin6_addr, &sin1->sin6_addr);
+static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
+				     const struct sockaddr *sa2)
+{
+	if (unlikely(sa1->sa_family != AF_INET || sa2->sa_family != AF_INET))
+		return 0;
+	return nfs_sockaddr_match_ipaddr4((const struct sockaddr_in *)sa1,
+					  (const struct sockaddr_in *)sa2);
 }
-#else	/* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */
-static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
-				      const struct sockaddr *sa2)
+
+static int nfs_sockaddr_cmp_ip6(const struct sockaddr *sa1,
+				const struct sockaddr *sa2)
 {
 	return 0;
 }
@@ -262,57 +311,20 @@ static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
  *
  * The caller should ensure both socket addresses are AF_INET.
  */
-static int nfs_sockaddr_match_ipaddr4(const struct sockaddr *sa1,
-				      const struct sockaddr *sa2)
-{
-	const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1;
-	const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2;
-
-	return sin1->sin_addr.s_addr == sin2->sin_addr.s_addr;
-}
-
-static int nfs_sockaddr_cmp_ip6(const struct sockaddr *sa1,
-				const struct sockaddr *sa2)
-{
-	const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1;
-	const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2;
-
-	return nfs_sockaddr_match_ipaddr6(sa1, sa2) &&
-	       (sin1->sin6_port == sin2->sin6_port);
-}
-
 static int nfs_sockaddr_cmp_ip4(const struct sockaddr *sa1,
 				const struct sockaddr *sa2)
 {
-	const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1;
-	const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2;
+	const struct sockaddr_in *saddr1 = (const struct sockaddr_in *)sa1;
+	const struct sockaddr_in *saddr2 = (const struct sockaddr_in *)sa2;
 
-	return nfs_sockaddr_match_ipaddr4(sa1, sa2) &&
-	       (sin1->sin_port == sin2->sin_port);
-}
-
-/*
- * Test if two socket addresses represent the same actual socket,
- * by comparing (only) relevant fields, excluding the port number.
- */ -static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1, - const struct sockaddr *sa2) -{ - if (sa1->sa_family != sa2->sa_family) + if (saddr1->sin_addr.s_addr != saddr2->sin_addr.s_addr) return 0; - - switch (sa1->sa_family) { - case AF_INET: - return nfs_sockaddr_match_ipaddr4(sa1, sa2); - case AF_INET6: - return nfs_sockaddr_match_ipaddr6(sa1, sa2); - } - return 0; + return saddr1->sin_port == saddr2->sin_port; } /* * Test if two socket addresses represent the same actual socket, - * by comparing (only) relevant fields, including the port number. + * by comparing (only) relevant fields. */ static int nfs_sockaddr_cmp(const struct sockaddr *sa1, const struct sockaddr *sa2) diff --git a/trunk/fs/nfs/dir.c b/trunk/fs/nfs/dir.c index 370b190a09d1..78bf72fc1db3 100644 --- a/trunk/fs/nfs/dir.c +++ b/trunk/fs/nfs/dir.c @@ -1624,7 +1624,8 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, } else if (atomic_read(&new_dentry->d_count) > 1) /* dentry still busy? */ goto out; - } + } else + nfs_drop_nlink(new_inode); go_ahead: /* @@ -1637,8 +1638,10 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, } nfs_inode_return_delegation(old_inode); - if (new_inode != NULL) + if (new_inode != NULL) { nfs_inode_return_delegation(new_inode); + d_delete(new_dentry); + } error = NFS_PROTO(old_dir)->rename(old_dir, &old_dentry->d_name, new_dir, &new_dentry->d_name); @@ -1647,8 +1650,6 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (rehash) d_rehash(rehash); if (!error) { - if (new_inode != NULL) - nfs_drop_nlink(new_inode); d_move(old_dentry, new_dentry); nfs_set_verifier(new_dentry, nfs_save_change_attribute(new_dir)); diff --git a/trunk/fs/nfs/file.c b/trunk/fs/nfs/file.c index 0abf3f331f56..cec79392e4ba 100644 --- a/trunk/fs/nfs/file.c +++ b/trunk/fs/nfs/file.c @@ -64,7 +64,11 @@ const struct file_operations nfs_file_operations = { .write = do_sync_write, .aio_read = nfs_file_read, .aio_write = nfs_file_write, +#ifdef CONFIG_MMU .mmap = nfs_file_mmap, +#else + .mmap = generic_file_mmap, +#endif .open = nfs_file_open, .flush = nfs_file_flush, .release = nfs_file_release, @@ -137,6 +141,9 @@ nfs_file_release(struct inode *inode, struct file *filp) dentry->d_parent->d_name.name, dentry->d_name.name); + /* Ensure that dirty pages are flushed out with the right creds */ + if (filp->f_mode & FMODE_WRITE) + nfs_wb_all(dentry->d_inode); nfs_inc_stats(inode, NFSIOS_VFSRELEASE); return nfs_release(inode, filp); } @@ -228,6 +235,7 @@ nfs_file_flush(struct file *file, fl_owner_t id) struct nfs_open_context *ctx = nfs_file_open_context(file); struct dentry *dentry = file->f_path.dentry; struct inode *inode = dentry->d_inode; + int status; dprintk("NFS: flush(%s/%s)\n", dentry->d_parent->d_name.name, @@ -237,8 +245,11 @@ nfs_file_flush(struct file *file, fl_owner_t id) return 0; nfs_inc_stats(inode, NFSIOS_VFSFLUSH); - /* Flush writes to the server and return any errors */ - return nfs_do_fsync(ctx, inode); + /* Ensure that data+attribute caches are up to date after close() */ + status = nfs_do_fsync(ctx, inode); + if (!status) + nfs_revalidate_inode(NFS_SERVER(inode), inode); + return status; } static ssize_t @@ -293,13 +304,11 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma) dprintk("NFS: mmap(%s/%s)\n", dentry->d_parent->d_name.name, dentry->d_name.name); - /* Note: generic_file_mmap() returns ENOSYS on nommu systems - * so we call that before revalidating the mapping - */ - status = generic_file_mmap(file, 
vma); + status = nfs_revalidate_mapping(inode, file->f_mapping); if (!status) { vma->vm_ops = &nfs_file_vm_ops; - status = nfs_revalidate_mapping(inode, file->f_mapping); + vma->vm_flags |= VM_CAN_NONLINEAR; + file_accessed(file); } return status; } @@ -345,15 +354,6 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping, file->f_path.dentry->d_name.name, mapping->host->i_ino, len, (long long) pos); - /* - * Prevent starvation issues if someone is doing a consistency - * sync-to-disk - */ - ret = wait_on_bit(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING, - nfs_wait_bit_killable, TASK_KILLABLE); - if (ret) - return ret; - page = grab_cache_page_write_begin(mapping, index, flags); if (!page) return -ENOMEM; diff --git a/trunk/fs/nfs/getroot.c b/trunk/fs/nfs/getroot.c index 46177cb87064..b7c9b2df1f29 100644 --- a/trunk/fs/nfs/getroot.c +++ b/trunk/fs/nfs/getroot.c @@ -156,7 +156,7 @@ int nfs4_path_walk(struct nfs_server *server, return ret; } - if (!S_ISDIR(fattr.mode)) { + if (fattr.type != NFDIR) { printk(KERN_ERR "nfs4_get_root:" " getroot encountered non-directory\n"); return -ENOTDIR; @@ -213,7 +213,7 @@ int nfs4_path_walk(struct nfs_server *server, return ret; } - if (!S_ISDIR(fattr.mode)) { + if (fattr.type != NFDIR) { printk(KERN_ERR "nfs4_get_root:" " lookupfh encountered non-directory\n"); return -ENOTDIR; diff --git a/trunk/fs/nfs/inode.c b/trunk/fs/nfs/inode.c index a834d1d850b7..0c381686171e 100644 --- a/trunk/fs/nfs/inode.c +++ b/trunk/fs/nfs/inode.c @@ -65,18 +65,6 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr) return nfs_fileid_to_ino_t(fattr->fileid); } -/** - * nfs_wait_bit_killable - helper for functions that are sleeping on bit locks - * @word: long word containing the bit lock - */ -int nfs_wait_bit_killable(void *word) -{ - if (fatal_signal_pending(current)) - return -ERESTARTSYS; - schedule(); - return 0; -} - /** * nfs_compat_user_ino64 - returns the user-visible inode number * @fileid: 64-bit fileid @@ -261,10 +249,13 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) struct inode *inode = ERR_PTR(-ENOENT); unsigned long hash; - if ((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0) + if ((fattr->valid & NFS_ATTR_FATTR) == 0) goto out_no_inode; - if ((fattr->valid & NFS_ATTR_FATTR_TYPE) == 0) + + if (!fattr->nlink) { + printk("NFS: Buggy server - nlink == 0!\n"); goto out_no_inode; + } hash = nfs_fattr_to_ino_t(fattr); @@ -300,8 +291,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) && fattr->size <= NFS_LIMIT_READDIRPLUS) set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); /* Deal with crossing mountpoints */ - if ((fattr->valid & NFS_ATTR_FATTR_FSID) - && !nfs_fsid_equal(&NFS_SB(sb)->fsid, &fattr->fsid)) { + if (!nfs_fsid_equal(&NFS_SB(sb)->fsid, &fattr->fsid)) { if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) inode->i_op = &nfs_referral_inode_operations; else @@ -314,45 +304,28 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) else init_special_inode(inode, inode->i_mode, fattr->rdev); - memset(&inode->i_atime, 0, sizeof(inode->i_atime)); - memset(&inode->i_mtime, 0, sizeof(inode->i_mtime)); - memset(&inode->i_ctime, 0, sizeof(inode->i_ctime)); - nfsi->change_attr = 0; - inode->i_size = 0; - inode->i_nlink = 0; - inode->i_uid = -2; - inode->i_gid = -2; - inode->i_blocks = 0; - memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); - nfsi->read_cache_jiffies = fattr->time_start; nfsi->attr_gencount = fattr->gencount; - if (fattr->valid & 
NFS_ATTR_FATTR_ATIME) - inode->i_atime = fattr->atime; - if (fattr->valid & NFS_ATTR_FATTR_MTIME) - inode->i_mtime = fattr->mtime; - if (fattr->valid & NFS_ATTR_FATTR_CTIME) - inode->i_ctime = fattr->ctime; - if (fattr->valid & NFS_ATTR_FATTR_CHANGE) + inode->i_atime = fattr->atime; + inode->i_mtime = fattr->mtime; + inode->i_ctime = fattr->ctime; + if (fattr->valid & NFS_ATTR_FATTR_V4) nfsi->change_attr = fattr->change_attr; - if (fattr->valid & NFS_ATTR_FATTR_SIZE) - inode->i_size = nfs_size_to_loff_t(fattr->size); - if (fattr->valid & NFS_ATTR_FATTR_NLINK) - inode->i_nlink = fattr->nlink; - if (fattr->valid & NFS_ATTR_FATTR_OWNER) - inode->i_uid = fattr->uid; - if (fattr->valid & NFS_ATTR_FATTR_GROUP) - inode->i_gid = fattr->gid; - if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) - inode->i_blocks = fattr->du.nfs2.blocks; - if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { + inode->i_size = nfs_size_to_loff_t(fattr->size); + inode->i_nlink = fattr->nlink; + inode->i_uid = fattr->uid; + inode->i_gid = fattr->gid; + if (fattr->valid & (NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4)) { /* * report the blocks in 512byte units */ inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used); + } else { + inode->i_blocks = fattr->du.nfs2.blocks; } nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); nfsi->attrtimeo_timestamp = now; + memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); nfsi->access_cache = RB_ROOT; unlock_new_inode(inode); @@ -541,32 +514,6 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) return err; } -/** - * nfs_close_context - Common close_context() routine NFSv2/v3 - * @ctx: pointer to context - * @is_sync: is this a synchronous close - * - * always ensure that the attributes are up to date if we're mounted - * with close-to-open semantics - */ -void nfs_close_context(struct nfs_open_context *ctx, int is_sync) -{ - struct inode *inode; - struct nfs_server *server; - - if (!(ctx->mode & FMODE_WRITE)) - return; - if (!is_sync) - return; - inode = ctx->path.dentry->d_inode; - if (!list_empty(&NFS_I(inode)->open_files)) - return; - server = NFS_SERVER(inode); - if (server->flags & NFS_MOUNT_NOCTO) - return; - nfs_revalidate_inode(server, inode); -} - static struct nfs_open_context *alloc_nfs_open_context(struct vfsmount *mnt, struct dentry *dentry, struct rpc_cred *cred) { struct nfs_open_context *ctx; @@ -593,15 +540,24 @@ struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx) return ctx; } -static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync) +static void __put_nfs_open_context(struct nfs_open_context *ctx, int wait) { - struct inode *inode = ctx->path.dentry->d_inode; + struct inode *inode; + + if (ctx == NULL) + return; + inode = ctx->path.dentry->d_inode; if (!atomic_dec_and_lock(&ctx->count, &inode->i_lock)) return; list_del(&ctx->list); spin_unlock(&inode->i_lock); - NFS_PROTO(inode)->close_context(ctx, is_sync); + if (ctx->state != NULL) { + if (wait) + nfs4_close_sync(&ctx->path, ctx->state, ctx->mode); + else + nfs4_close_state(&ctx->path, ctx->state, ctx->mode); + } if (ctx->cred != NULL) put_rpccred(ctx->cred); path_put(&ctx->path); @@ -714,6 +670,9 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) if (NFS_STALE(inode)) goto out; + if (NFS_STALE(inode)) + goto out; + nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE); status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), &fattr); if (status != 0) { @@ -856,31 +815,25 @@ static void nfs_wcc_update_inode(struct inode *inode, struct 
nfs_fattr *fattr) { struct nfs_inode *nfsi = NFS_I(inode); - if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE) - && (fattr->valid & NFS_ATTR_FATTR_CHANGE) - && nfsi->change_attr == fattr->pre_change_attr) { + if ((fattr->valid & NFS_ATTR_WCC_V4) != 0 && + nfsi->change_attr == fattr->pre_change_attr) { nfsi->change_attr = fattr->change_attr; if (S_ISDIR(inode->i_mode)) nfsi->cache_validity |= NFS_INO_INVALID_DATA; } /* If we have atomic WCC data, we may update some attributes */ - if ((fattr->valid & NFS_ATTR_FATTR_PRECTIME) - && (fattr->valid & NFS_ATTR_FATTR_CTIME) - && timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) + if ((fattr->valid & NFS_ATTR_WCC) != 0) { + if (timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); - - if ((fattr->valid & NFS_ATTR_FATTR_PREMTIME) - && (fattr->valid & NFS_ATTR_FATTR_MTIME) - && timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) { + if (timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) { memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); if (S_ISDIR(inode->i_mode)) nfsi->cache_validity |= NFS_INO_INVALID_DATA; - } - if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE) - && (fattr->valid & NFS_ATTR_FATTR_SIZE) - && i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size) - && nfsi->npages == 0) + } + if (i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size) && + nfsi->npages == 0) i_size_write(inode, nfs_size_to_loff_t(fattr->size)); + } } /** @@ -900,39 +853,35 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat /* Has the inode gone and changed behind our back? */ - if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid) - return -EIO; - if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) + if (nfsi->fileid != fattr->fileid + || (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) { return -EIO; + } - if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && + if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 && nfsi->change_attr != fattr->change_attr) invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; /* Verify a few of the more important attributes */ - if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime)) + if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; - if (fattr->valid & NFS_ATTR_FATTR_SIZE) { - cur_size = i_size_read(inode); - new_isize = nfs_size_to_loff_t(fattr->size); - if (cur_size != new_isize && nfsi->npages == 0) - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; - } + cur_size = i_size_read(inode); + new_isize = nfs_size_to_loff_t(fattr->size); + if (cur_size != new_isize && nfsi->npages == 0) + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; /* Have any file permissions changed? */ - if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) - invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; - if ((fattr->valid & NFS_ATTR_FATTR_OWNER) && inode->i_uid != fattr->uid) - invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; - if ((fattr->valid & NFS_ATTR_FATTR_GROUP) && inode->i_gid != fattr->gid) + if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO) + || inode->i_uid != fattr->uid + || inode->i_gid != fattr->gid) invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; /* Has the link count changed? 
*/ - if ((fattr->valid & NFS_ATTR_FATTR_NLINK) && inode->i_nlink != fattr->nlink) + if (inode->i_nlink != fattr->nlink) invalid |= NFS_INO_INVALID_ATTR; - if ((fattr->valid & NFS_ATTR_FATTR_ATIME) && !timespec_equal(&inode->i_atime, &fattr->atime)) + if (!timespec_equal(&inode->i_atime, &fattr->atime)) invalid |= NFS_INO_INVALID_ATIME; if (invalid != 0) @@ -944,15 +893,11 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat static int nfs_ctime_need_update(const struct inode *inode, const struct nfs_fattr *fattr) { - if (!(fattr->valid & NFS_ATTR_FATTR_CTIME)) - return 0; return timespec_compare(&fattr->ctime, &inode->i_ctime) > 0; } static int nfs_size_need_update(const struct inode *inode, const struct nfs_fattr *fattr) { - if (!(fattr->valid & NFS_ATTR_FATTR_SIZE)) - return 0; return nfs_size_to_loff_t(fattr->size) > i_size_read(inode); } @@ -1088,31 +1033,20 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa /* Don't do a WCC update if these attributes are already stale */ if ((fattr->valid & NFS_ATTR_FATTR) == 0 || !nfs_inode_attrs_need_update(inode, fattr)) { - fattr->valid &= ~(NFS_ATTR_FATTR_PRECHANGE - | NFS_ATTR_FATTR_PRESIZE - | NFS_ATTR_FATTR_PREMTIME - | NFS_ATTR_FATTR_PRECTIME); + fattr->valid &= ~(NFS_ATTR_WCC_V4|NFS_ATTR_WCC); goto out_noforce; } - if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && - (fattr->valid & NFS_ATTR_FATTR_PRECHANGE) == 0) { + if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 && + (fattr->valid & NFS_ATTR_WCC_V4) == 0) { fattr->pre_change_attr = NFS_I(inode)->change_attr; - fattr->valid |= NFS_ATTR_FATTR_PRECHANGE; + fattr->valid |= NFS_ATTR_WCC_V4; } - if ((fattr->valid & NFS_ATTR_FATTR_CTIME) != 0 && - (fattr->valid & NFS_ATTR_FATTR_PRECTIME) == 0) { + if ((fattr->valid & NFS_ATTR_FATTR) != 0 && + (fattr->valid & NFS_ATTR_WCC) == 0) { memcpy(&fattr->pre_ctime, &inode->i_ctime, sizeof(fattr->pre_ctime)); - fattr->valid |= NFS_ATTR_FATTR_PRECTIME; - } - if ((fattr->valid & NFS_ATTR_FATTR_MTIME) != 0 && - (fattr->valid & NFS_ATTR_FATTR_PREMTIME) == 0) { memcpy(&fattr->pre_mtime, &inode->i_mtime, sizeof(fattr->pre_mtime)); - fattr->valid |= NFS_ATTR_FATTR_PREMTIME; - } - if ((fattr->valid & NFS_ATTR_FATTR_SIZE) != 0 && - (fattr->valid & NFS_ATTR_FATTR_PRESIZE) == 0) { fattr->pre_size = i_size_read(inode); - fattr->valid |= NFS_ATTR_FATTR_PRESIZE; + fattr->valid |= NFS_ATTR_WCC; } out_noforce: status = nfs_post_op_update_inode_locked(inode, fattr); @@ -1144,18 +1078,18 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) __func__, inode->i_sb->s_id, inode->i_ino, atomic_read(&inode->i_count), fattr->valid); - if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid) + if (nfsi->fileid != fattr->fileid) goto out_fileid; /* * Make sure the inode's type hasn't changed. */ - if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) + if ((inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) goto out_changed; server = NFS_SERVER(inode); /* Update the fsid? 
*/ - if (S_ISDIR(inode->i_mode) && (fattr->valid & NFS_ATTR_FATTR_FSID) && + if (S_ISDIR(inode->i_mode) && !nfs_fsid_equal(&server->fsid, &fattr->fsid) && !test_bit(NFS_INO_MOUNTPOINT, &nfsi->flags)) server->fsid = fattr->fsid; @@ -1165,27 +1099,14 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) */ nfsi->read_cache_jiffies = fattr->time_start; - if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) || (fattr->valid & (NFS_ATTR_FATTR_MTIME|NFS_ATTR_FATTR_CTIME))) - nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR - | NFS_INO_INVALID_ATIME - | NFS_INO_REVAL_PAGECACHE); + nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ATIME + | NFS_INO_REVAL_PAGECACHE); /* Do atomic weak cache consistency updates */ nfs_wcc_update_inode(inode, fattr); /* More cache consistency checks */ - if (fattr->valid & NFS_ATTR_FATTR_CHANGE) { - if (nfsi->change_attr != fattr->change_attr) { - dprintk("NFS: change_attr change on server for file %s/%ld\n", - inode->i_sb->s_id, inode->i_ino); - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; - if (S_ISDIR(inode->i_mode)) - nfs_force_lookup_revalidate(inode); - nfsi->change_attr = fattr->change_attr; - } - } - - if (fattr->valid & NFS_ATTR_FATTR_MTIME) { + if (!(fattr->valid & NFS_ATTR_FATTR_V4)) { /* NFSv2/v3: Check if the mtime agrees */ if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) { dprintk("NFS: mtime change on server for file %s/%ld\n", @@ -1193,80 +1114,59 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; if (S_ISDIR(inode->i_mode)) nfs_force_lookup_revalidate(inode); - memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); } - } - if (fattr->valid & NFS_ATTR_FATTR_CTIME) { /* If ctime has changed we should definitely clear access+acl caches */ - if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) { + if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; - /* and probably clear data for a directory too as utimes can cause - * havoc with our cache. - */ - if (S_ISDIR(inode->i_mode)) { - invalid |= NFS_INO_INVALID_DATA; - nfs_force_lookup_revalidate(inode); - } - memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); - } + } else if (nfsi->change_attr != fattr->change_attr) { + dprintk("NFS: change_attr change on server for file %s/%ld\n", + inode->i_sb->s_id, inode->i_ino); + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + if (S_ISDIR(inode->i_mode)) + nfs_force_lookup_revalidate(inode); } /* Check if our cached file size is stale */ - if (fattr->valid & NFS_ATTR_FATTR_SIZE) { - new_isize = nfs_size_to_loff_t(fattr->size); - cur_isize = i_size_read(inode); - if (new_isize != cur_isize) { - /* Do we perhaps have any outstanding writes, or has - * the file grown beyond our last write? */ - if (nfsi->npages == 0 || new_isize > cur_isize) { - i_size_write(inode, new_isize); - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; - } - dprintk("NFS: isize change on server for file %s/%ld\n", - inode->i_sb->s_id, inode->i_ino); + new_isize = nfs_size_to_loff_t(fattr->size); + cur_isize = i_size_read(inode); + if (new_isize != cur_isize) { + /* Do we perhaps have any outstanding writes, or has + * the file grown beyond our last write? 
*/ + if (nfsi->npages == 0 || new_isize > cur_isize) { + i_size_write(inode, new_isize); + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; } + dprintk("NFS: isize change on server for file %s/%ld\n", + inode->i_sb->s_id, inode->i_ino); } - if (fattr->valid & NFS_ATTR_FATTR_ATIME) - memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime)); + memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); + memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); + memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime)); + nfsi->change_attr = fattr->change_attr; - if (fattr->valid & NFS_ATTR_FATTR_MODE) { - if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) { - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; - inode->i_mode = fattr->mode; - } - } - if (fattr->valid & NFS_ATTR_FATTR_OWNER) { - if (inode->i_uid != fattr->uid) { - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; - inode->i_uid = fattr->uid; - } - } - if (fattr->valid & NFS_ATTR_FATTR_GROUP) { - if (inode->i_gid != fattr->gid) { - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; - inode->i_gid = fattr->gid; - } - } + if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO) || + inode->i_uid != fattr->uid || + inode->i_gid != fattr->gid) + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; - if (fattr->valid & NFS_ATTR_FATTR_NLINK) { - if (inode->i_nlink != fattr->nlink) { - invalid |= NFS_INO_INVALID_ATTR; - if (S_ISDIR(inode->i_mode)) - invalid |= NFS_INO_INVALID_DATA; - inode->i_nlink = fattr->nlink; - } - } + if (inode->i_nlink != fattr->nlink) + invalid |= NFS_INO_INVALID_ATTR; + + inode->i_mode = fattr->mode; + inode->i_nlink = fattr->nlink; + inode->i_uid = fattr->uid; + inode->i_gid = fattr->gid; - if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { + if (fattr->valid & (NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4)) { /* * report the blocks in 512byte units */ inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used); + } else { + inode->i_blocks = fattr->du.nfs2.blocks; } - if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) - inode->i_blocks = fattr->du.nfs2.blocks; /* Update attrtimeo value if we're out of the unstable period */ if (invalid & NFS_INO_INVALID_ATTR) { @@ -1374,6 +1274,7 @@ static void init_once(void *foo) INIT_LIST_HEAD(&nfsi->access_cache_entry_lru); INIT_LIST_HEAD(&nfsi->access_cache_inode_lru); INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC); + nfsi->ncommit = 0; nfsi->npages = 0; atomic_set(&nfsi->silly_count, 1); INIT_HLIST_HEAD(&nfsi->silly_list); diff --git a/trunk/fs/nfs/internal.h b/trunk/fs/nfs/internal.h index 2041f68ff1cc..340ede8f608f 100644 --- a/trunk/fs/nfs/internal.h +++ b/trunk/fs/nfs/internal.h @@ -152,9 +152,6 @@ extern __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus); extern struct rpc_procinfo nfs4_procedures[]; #endif -/* proc.c */ -void nfs_close_context(struct nfs_open_context *ctx, int is_sync); - /* dir.c */ extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask); @@ -168,7 +165,6 @@ extern void nfs_clear_inode(struct inode *); extern void nfs4_clear_inode(struct inode *); #endif void nfs_zap_acl_cache(struct inode *inode); -extern int nfs_wait_bit_killable(void *word); /* super.c */ void nfs_parse_ip_address(char *, size_t, struct sockaddr *, size_t *); diff --git a/trunk/fs/nfs/nfs2xdr.c b/trunk/fs/nfs/nfs2xdr.c index c862c9340f9a..28bab67d1519 100644 --- a/trunk/fs/nfs/nfs2xdr.c +++ 
b/trunk/fs/nfs/nfs2xdr.c @@ -120,8 +120,8 @@ xdr_decode_time(__be32 *p, struct timespec *timep) static __be32 * xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr) { - u32 rdev, type; - type = ntohl(*p++); + u32 rdev; + fattr->type = (enum nfs_ftype) ntohl(*p++); fattr->mode = ntohl(*p++); fattr->nlink = ntohl(*p++); fattr->uid = ntohl(*p++); @@ -136,9 +136,10 @@ xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr) p = xdr_decode_time(p, &fattr->atime); p = xdr_decode_time(p, &fattr->mtime); p = xdr_decode_time(p, &fattr->ctime); - fattr->valid |= NFS_ATTR_FATTR_V2; + fattr->valid |= NFS_ATTR_FATTR; fattr->rdev = new_decode_dev(rdev); - if (type == NFCHR && rdev == NFS2_FIFO_DEV) { + if (fattr->type == NFCHR && rdev == NFS2_FIFO_DEV) { + fattr->type = NFFIFO; fattr->mode = (fattr->mode & ~S_IFMT) | S_IFIFO; fattr->rdev = 0; } diff --git a/trunk/fs/nfs/nfs3proc.c b/trunk/fs/nfs/nfs3proc.c index b82fe6847f14..c55be7a7679e 100644 --- a/trunk/fs/nfs/nfs3proc.c +++ b/trunk/fs/nfs/nfs3proc.c @@ -834,5 +834,4 @@ const struct nfs_rpc_ops nfs_v3_clientops = { .commit_done = nfs3_commit_done, .lock = nfs3_proc_lock, .clear_acl_cache = nfs3_forget_cached_acls, - .close_context = nfs_close_context, }; diff --git a/trunk/fs/nfs/nfs3xdr.c b/trunk/fs/nfs/nfs3xdr.c index e6a1932c7110..6cdeacffde46 100644 --- a/trunk/fs/nfs/nfs3xdr.c +++ b/trunk/fs/nfs/nfs3xdr.c @@ -91,15 +91,19 @@ /* * Map file type to S_IFMT bits */ -static const umode_t nfs_type2fmt[] = { - [NF3BAD] = 0, - [NF3REG] = S_IFREG, - [NF3DIR] = S_IFDIR, - [NF3BLK] = S_IFBLK, - [NF3CHR] = S_IFCHR, - [NF3LNK] = S_IFLNK, - [NF3SOCK] = S_IFSOCK, - [NF3FIFO] = S_IFIFO, +static struct { + unsigned int mode; + unsigned int nfs2type; +} nfs_type2fmt[] = { + { 0, NFNON }, + { S_IFREG, NFREG }, + { S_IFDIR, NFDIR }, + { S_IFBLK, NFBLK }, + { S_IFCHR, NFCHR }, + { S_IFLNK, NFLNK }, + { S_IFSOCK, NFSOCK }, + { S_IFIFO, NFFIFO }, + { 0, NFBAD } }; /* @@ -144,12 +148,13 @@ static __be32 * xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr) { unsigned int type, major, minor; - umode_t fmode; + int fmode; type = ntohl(*p++); - if (type > NF3FIFO) - type = NF3NON; - fmode = nfs_type2fmt[type]; + if (type >= NF3BAD) + type = NF3BAD; + fmode = nfs_type2fmt[type].mode; + fattr->type = nfs_type2fmt[type].nfs2type; fattr->mode = (ntohl(*p++) & ~S_IFMT) | fmode; fattr->nlink = ntohl(*p++); fattr->uid = ntohl(*p++); @@ -172,7 +177,7 @@ xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr) p = xdr_decode_time3(p, &fattr->ctime); /* Update the mode bits */ - fattr->valid |= NFS_ATTR_FATTR_V3; + fattr->valid |= (NFS_ATTR_FATTR | NFS_ATTR_FATTR_V3); return p; } @@ -228,9 +233,7 @@ xdr_decode_wcc_attr(__be32 *p, struct nfs_fattr *fattr) p = xdr_decode_hyper(p, &fattr->pre_size); p = xdr_decode_time3(p, &fattr->pre_mtime); p = xdr_decode_time3(p, &fattr->pre_ctime); - fattr->valid |= NFS_ATTR_FATTR_PRESIZE - | NFS_ATTR_FATTR_PREMTIME - | NFS_ATTR_FATTR_PRECTIME; + fattr->valid |= NFS_ATTR_WCC; return p; } diff --git a/trunk/fs/nfs/nfs4proc.c b/trunk/fs/nfs/nfs4proc.c index 97bacccff579..8dde84b988d9 100644 --- a/trunk/fs/nfs/nfs4proc.c +++ b/trunk/fs/nfs/nfs4proc.c @@ -193,6 +193,14 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent kunmap_atomic(start, KM_USER0); } +static int nfs4_wait_bit_killable(void *word) +{ + if (fatal_signal_pending(current)) + return -ERESTARTSYS; + schedule(); + return 0; +} + static int nfs4_wait_clnt_recover(struct nfs_client *clp) { int res; @@ -200,7 +208,7 @@ static int nfs4_wait_clnt_recover(struct 
nfs_client *clp) might_sleep(); res = wait_on_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING, - nfs_wait_bit_killable, TASK_KILLABLE); + nfs4_wait_bit_killable, TASK_KILLABLE); return res; } @@ -1431,7 +1439,7 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait) if (calldata->arg.seqid == NULL) goto out_free_calldata; calldata->arg.fmode = 0; - calldata->arg.bitmask = server->cache_consistency_bitmask; + calldata->arg.bitmask = server->attr_bitmask; calldata->res.fattr = &calldata->fattr; calldata->res.seqid = calldata->arg.seqid; calldata->res.server = server; @@ -1572,15 +1580,6 @@ nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, st return 0; } -void nfs4_close_context(struct nfs_open_context *ctx, int is_sync) -{ - if (ctx->state == NULL) - return; - if (is_sync) - nfs4_close_sync(&ctx->path, ctx->state, ctx->mode); - else - nfs4_close_state(&ctx->path, ctx->state, ctx->mode); -} static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) { @@ -1601,9 +1600,6 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f server->caps |= NFS_CAP_HARDLINKS; if (res.has_symlinks != 0) server->caps |= NFS_CAP_SYMLINKS; - memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask)); - server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE; - server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY; server->acl_bitmask = res.acl_bitmask; } return status; @@ -2083,7 +2079,7 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir) struct nfs_removeargs *args = msg->rpc_argp; struct nfs_removeres *res = msg->rpc_resp; - args->bitmask = server->cache_consistency_bitmask; + args->bitmask = server->attr_bitmask; res->server = server; msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE]; } @@ -2327,7 +2323,7 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, .pages = &page, .pgbase = 0, .count = count, - .bitmask = NFS_SERVER(dentry->d_inode)->cache_consistency_bitmask, + .bitmask = NFS_SERVER(dentry->d_inode)->attr_bitmask, }; struct nfs4_readdir_res res; struct rpc_message msg = { @@ -2556,7 +2552,7 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag { struct nfs_server *server = NFS_SERVER(data->inode); - data->args.bitmask = server->cache_consistency_bitmask; + data->args.bitmask = server->attr_bitmask; data->res.server = server; data->timestamp = jiffies; @@ -2579,7 +2575,7 @@ static void nfs4_proc_commit_setup(struct nfs_write_data *data, struct rpc_messa { struct nfs_server *server = NFS_SERVER(data->inode); - data->args.bitmask = server->cache_consistency_bitmask; + data->args.bitmask = server->attr_bitmask; data->res.server = server; msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT]; } @@ -3682,19 +3678,6 @@ ssize_t nfs4_listxattr(struct dentry *dentry, char *buf, size_t buflen) return len; } -static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr) -{ - if (!((fattr->valid & NFS_ATTR_FATTR_FILEID) && - (fattr->valid & NFS_ATTR_FATTR_FSID) && - (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL))) - return; - - fattr->valid |= NFS_ATTR_FATTR_TYPE | NFS_ATTR_FATTR_MODE | - NFS_ATTR_FATTR_NLINK; - fattr->mode = S_IFDIR | S_IRUGO | S_IXUGO; - fattr->nlink = 2; -} - int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, struct nfs4_fs_locations *fs_locations, struct page *page) { @@ -3721,7 
+3704,6 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, fs_locations->server = server; fs_locations->nlocations = 0; status = rpc_call_sync(server->client, &msg, 0); - nfs_fixup_referral_attributes(&fs_locations->fattr); dprintk("%s: returned status = %d\n", __func__, status); return status; } @@ -3785,7 +3767,6 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .commit_done = nfs4_commit_done, .lock = nfs4_proc_lock, .clear_acl_cache = nfs4_zap_acl_attr, - .close_context = nfs4_close_context, }; /* diff --git a/trunk/fs/nfs/nfs4state.c b/trunk/fs/nfs/nfs4state.c index 0298e909559f..2022fe47966f 100644 --- a/trunk/fs/nfs/nfs4state.c +++ b/trunk/fs/nfs/nfs4state.c @@ -62,14 +62,8 @@ static LIST_HEAD(nfs4_clientid_list); static int nfs4_init_client(struct nfs_client *clp, struct rpc_cred *cred) { - unsigned short port; - int status; - - port = nfs_callback_tcpport; - if (clp->cl_addr.ss_family == AF_INET6) - port = nfs_callback_tcpport6; - - status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred); + int status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, + nfs_callback_tcpport, cred); if (status == 0) status = nfs4_proc_setclientid_confirm(clp, cred); if (status == 0) diff --git a/trunk/fs/nfs/nfs4xdr.c b/trunk/fs/nfs/nfs4xdr.c index 1690f0e44b91..d1e4c8f8a0a9 100644 --- a/trunk/fs/nfs/nfs4xdr.c +++ b/trunk/fs/nfs/nfs4xdr.c @@ -522,17 +522,20 @@ static int nfs4_stat_to_errno(int); decode_lookup_maxsz + \ decode_fs_locations_maxsz) -static const umode_t nfs_type2fmt[] = { - [NF4BAD] = 0, - [NF4REG] = S_IFREG, - [NF4DIR] = S_IFDIR, - [NF4BLK] = S_IFBLK, - [NF4CHR] = S_IFCHR, - [NF4LNK] = S_IFLNK, - [NF4SOCK] = S_IFSOCK, - [NF4FIFO] = S_IFIFO, - [NF4ATTRDIR] = 0, - [NF4NAMEDATTR] = 0, +static struct { + unsigned int mode; + unsigned int nfs2type; +} nfs_type2fmt[] = { + { 0, NFNON }, + { S_IFREG, NFREG }, + { S_IFDIR, NFDIR }, + { S_IFBLK, NFBLK }, + { S_IFCHR, NFCHR }, + { S_IFLNK, NFLNK }, + { S_IFSOCK, NFSOCK }, + { S_IFIFO, NFFIFO }, + { 0, NFNON }, + { 0, NFNON }, }; struct compound_hdr { @@ -2157,7 +2160,6 @@ static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint3 static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *type) { __be32 *p; - int ret = 0; *type = 0; if (unlikely(bitmap[0] & (FATTR4_WORD0_TYPE - 1U))) @@ -2170,16 +2172,14 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t * return -EIO; } bitmap[0] &= ~FATTR4_WORD0_TYPE; - ret = NFS_ATTR_FATTR_TYPE; } - dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type]); - return ret; + dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type].nfs2type); + return 0; } static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change) { __be32 *p; - int ret = 0; *change = 0; if (unlikely(bitmap[0] & (FATTR4_WORD0_CHANGE - 1U))) @@ -2188,17 +2188,15 @@ static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t READ_BUF(8); READ64(*change); bitmap[0] &= ~FATTR4_WORD0_CHANGE; - ret = NFS_ATTR_FATTR_CHANGE; } dprintk("%s: change attribute=%Lu\n", __func__, (unsigned long long)*change); - return ret; + return 0; } static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *size) { __be32 *p; - int ret = 0; *size = 0; if (unlikely(bitmap[0] & (FATTR4_WORD0_SIZE - 1U))) @@ -2207,10 +2205,9 @@ static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t * READ_BUF(8); READ64(*size); bitmap[0] &= ~FATTR4_WORD0_SIZE; - ret = NFS_ATTR_FATTR_SIZE; } dprintk("%s: 
file size=%Lu\n", __func__, (unsigned long long)*size); - return ret; + return 0; } static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -2248,7 +2245,6 @@ static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap, static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid) { __be32 *p; - int ret = 0; fsid->major = 0; fsid->minor = 0; @@ -2259,12 +2255,11 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs READ64(fsid->major); READ64(fsid->minor); bitmap[0] &= ~FATTR4_WORD0_FSID; - ret = NFS_ATTR_FATTR_FSID; } dprintk("%s: fsid=(0x%Lx/0x%Lx)\n", __func__, (unsigned long long)fsid->major, (unsigned long long)fsid->minor); - return ret; + return 0; } static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -2302,7 +2297,6 @@ static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid) { __be32 *p; - int ret = 0; *fileid = 0; if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEID - 1U))) @@ -2311,16 +2305,14 @@ static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t READ_BUF(8); READ64(*fileid); bitmap[0] &= ~FATTR4_WORD0_FILEID; - ret = NFS_ATTR_FATTR_FILEID; } dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid); - return ret; + return 0; } static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid) { __be32 *p; - int ret = 0; *fileid = 0; if (unlikely(bitmap[1] & (FATTR4_WORD1_MOUNTED_ON_FILEID - 1U))) @@ -2329,10 +2321,9 @@ static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitma READ_BUF(8); READ64(*fileid); bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; - ret = NFS_ATTR_FATTR_FILEID; } dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid); - return ret; + return 0; } static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -2488,8 +2479,6 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st if (res->nlocations < NFS4_FS_LOCATIONS_MAXENTRIES) res->nlocations++; } - if (res->nlocations != 0) - status = NFS_ATTR_FATTR_V4_REFERRAL; out: dprintk("%s: fs_locations done, error = %d\n", __func__, status); return status; @@ -2591,30 +2580,26 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32 return status; } -static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *mode) +static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *mode) { - uint32_t tmp; __be32 *p; - int ret = 0; *mode = 0; if (unlikely(bitmap[1] & (FATTR4_WORD1_MODE - 1U))) return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_MODE)) { READ_BUF(4); - READ32(tmp); - *mode = tmp & ~S_IFMT; + READ32(*mode); + *mode &= ~S_IFMT; bitmap[1] &= ~FATTR4_WORD1_MODE; - ret = NFS_ATTR_FATTR_MODE; } dprintk("%s: file mode=0%o\n", __func__, (unsigned int)*mode); - return ret; + return 0; } static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *nlink) { __be32 *p; - int ret = 0; *nlink = 1; if (unlikely(bitmap[1] & (FATTR4_WORD1_NUMLINKS - 1U))) @@ -2623,17 +2608,15 @@ static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t READ_BUF(4); READ32(*nlink); bitmap[1] &= ~FATTR4_WORD1_NUMLINKS; - ret = NFS_ATTR_FATTR_NLINK; } dprintk("%s: nlink=%u\n", __func__, (unsigned int)*nlink); - 
return ret; + return 0; } static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *uid) { uint32_t len; __be32 *p; - int ret = 0; *uid = -2; if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER - 1U))) @@ -2643,9 +2626,7 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf READ32(len); READ_BUF(len); if (len < XDR_MAX_NETOBJ) { - if (nfs_map_name_to_uid(clp, (char *)p, len, uid) == 0) - ret = NFS_ATTR_FATTR_OWNER; - else + if (nfs_map_name_to_uid(clp, (char *)p, len, uid) != 0) dprintk("%s: nfs_map_name_to_uid failed!\n", __func__); } else @@ -2654,14 +2635,13 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf bitmap[1] &= ~FATTR4_WORD1_OWNER; } dprintk("%s: uid=%d\n", __func__, (int)*uid); - return ret; + return 0; } static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *gid) { uint32_t len; __be32 *p; - int ret = 0; *gid = -2; if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER_GROUP - 1U))) @@ -2671,9 +2651,7 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf READ32(len); READ_BUF(len); if (len < XDR_MAX_NETOBJ) { - if (nfs_map_group_to_gid(clp, (char *)p, len, gid) == 0) - ret = NFS_ATTR_FATTR_GROUP; - else + if (nfs_map_group_to_gid(clp, (char *)p, len, gid) != 0) dprintk("%s: nfs_map_group_to_gid failed!\n", __func__); } else @@ -2682,14 +2660,13 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf bitmap[1] &= ~FATTR4_WORD1_OWNER_GROUP; } dprintk("%s: gid=%d\n", __func__, (int)*gid); - return ret; + return 0; } static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rdev) { uint32_t major = 0, minor = 0; __be32 *p; - int ret = 0; *rdev = MKDEV(0,0); if (unlikely(bitmap[1] & (FATTR4_WORD1_RAWDEV - 1U))) @@ -2704,10 +2681,9 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde if (MAJOR(tmp) == major && MINOR(tmp) == minor) *rdev = tmp; bitmap[1] &= ~ FATTR4_WORD1_RAWDEV; - ret = NFS_ATTR_FATTR_RDEV; } dprintk("%s: rdev=(0x%x:0x%x)\n", __func__, major, minor); - return ret; + return 0; } static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -2764,7 +2740,6 @@ static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uin static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *used) { __be32 *p; - int ret = 0; *used = 0; if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_USED - 1U))) @@ -2773,11 +2748,10 @@ static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint READ_BUF(8); READ64(*used); bitmap[1] &= ~FATTR4_WORD1_SPACE_USED; - ret = NFS_ATTR_FATTR_SPACE_USED; } dprintk("%s: space used=%Lu\n", __func__, (unsigned long long)*used); - return ret; + return 0; } static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time) @@ -2804,8 +2778,6 @@ static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, str return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_TIME_ACCESS)) { status = decode_attr_time(xdr, time); - if (status == 0) - status = NFS_ATTR_FATTR_ATIME; bitmap[1] &= ~FATTR4_WORD1_TIME_ACCESS; } dprintk("%s: atime=%ld\n", __func__, (long)time->tv_sec); @@ -2822,8 +2794,6 @@ static int decode_attr_time_metadata(struct xdr_stream *xdr, uint32_t *bitmap, s return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_TIME_METADATA)) { status = decode_attr_time(xdr, time); - if (status == 0) - status = 
NFS_ATTR_FATTR_CTIME; bitmap[1] &= ~FATTR4_WORD1_TIME_METADATA; } dprintk("%s: ctime=%ld\n", __func__, (long)time->tv_sec); @@ -2840,8 +2810,6 @@ static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, str return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_TIME_MODIFY)) { status = decode_attr_time(xdr, time); - if (status == 0) - status = NFS_ATTR_FATTR_MTIME; bitmap[1] &= ~FATTR4_WORD1_TIME_MODIFY; } dprintk("%s: mtime=%ld\n", __func__, (long)time->tv_sec); @@ -3026,116 +2994,63 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons uint32_t attrlen, bitmap[2] = {0}, type; - int status; - umode_t fmode = 0; + int status, fmode = 0; uint64_t fileid; - status = decode_op_hdr(xdr, OP_GETATTR); - if (status < 0) + if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) goto xdr_error; - - status = decode_attr_bitmap(xdr, bitmap); - if (status < 0) + if ((status = decode_attr_bitmap(xdr, bitmap)) != 0) goto xdr_error; - status = decode_attr_length(xdr, &attrlen, &savep); - if (status < 0) + fattr->bitmap[0] = bitmap[0]; + fattr->bitmap[1] = bitmap[1]; + + if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0) goto xdr_error; - status = decode_attr_type(xdr, bitmap, &type); - if (status < 0) + if ((status = decode_attr_type(xdr, bitmap, &type)) != 0) goto xdr_error; - fattr->mode = 0; - if (status != 0) { - fattr->mode |= nfs_type2fmt[type]; - fattr->valid |= status; - } + fattr->type = nfs_type2fmt[type].nfs2type; + fmode = nfs_type2fmt[type].mode; - status = decode_attr_change(xdr, bitmap, &fattr->change_attr); - if (status < 0) + if ((status = decode_attr_change(xdr, bitmap, &fattr->change_attr)) != 0) goto xdr_error; - fattr->valid |= status; - - status = decode_attr_size(xdr, bitmap, &fattr->size); - if (status < 0) + if ((status = decode_attr_size(xdr, bitmap, &fattr->size)) != 0) goto xdr_error; - fattr->valid |= status; - - status = decode_attr_fsid(xdr, bitmap, &fattr->fsid); - if (status < 0) + if ((status = decode_attr_fsid(xdr, bitmap, &fattr->fsid)) != 0) goto xdr_error; - fattr->valid |= status; - - status = decode_attr_fileid(xdr, bitmap, &fattr->fileid); - if (status < 0) + if ((status = decode_attr_fileid(xdr, bitmap, &fattr->fileid)) != 0) goto xdr_error; - fattr->valid |= status; - - status = decode_attr_fs_locations(xdr, bitmap, container_of(fattr, + if ((status = decode_attr_fs_locations(xdr, bitmap, container_of(fattr, struct nfs4_fs_locations, - fattr)); - if (status < 0) + fattr))) != 0) goto xdr_error; - fattr->valid |= status; - - status = decode_attr_mode(xdr, bitmap, &fmode); - if (status < 0) + if ((status = decode_attr_mode(xdr, bitmap, &fattr->mode)) != 0) goto xdr_error; - if (status != 0) { - fattr->mode |= fmode; - fattr->valid |= status; - } - - status = decode_attr_nlink(xdr, bitmap, &fattr->nlink); - if (status < 0) + fattr->mode |= fmode; + if ((status = decode_attr_nlink(xdr, bitmap, &fattr->nlink)) != 0) goto xdr_error; - fattr->valid |= status; - - status = decode_attr_owner(xdr, bitmap, server->nfs_client, &fattr->uid); - if (status < 0) + if ((status = decode_attr_owner(xdr, bitmap, server->nfs_client, &fattr->uid)) != 0) goto xdr_error; - fattr->valid |= status; - - status = decode_attr_group(xdr, bitmap, server->nfs_client, &fattr->gid); - if (status < 0) + if ((status = decode_attr_group(xdr, bitmap, server->nfs_client, &fattr->gid)) != 0) goto xdr_error; - fattr->valid |= status; - - status = decode_attr_rdev(xdr, bitmap, &fattr->rdev); - if (status < 0) + if ((status = decode_attr_rdev(xdr, 
bitmap, &fattr->rdev)) != 0) goto xdr_error; - fattr->valid |= status; - - status = decode_attr_space_used(xdr, bitmap, &fattr->du.nfs3.used); - if (status < 0) + if ((status = decode_attr_space_used(xdr, bitmap, &fattr->du.nfs3.used)) != 0) goto xdr_error; - fattr->valid |= status; - - status = decode_attr_time_access(xdr, bitmap, &fattr->atime); - if (status < 0) + if ((status = decode_attr_time_access(xdr, bitmap, &fattr->atime)) != 0) goto xdr_error; - fattr->valid |= status; - - status = decode_attr_time_metadata(xdr, bitmap, &fattr->ctime); - if (status < 0) + if ((status = decode_attr_time_metadata(xdr, bitmap, &fattr->ctime)) != 0) goto xdr_error; - fattr->valid |= status; - - status = decode_attr_time_modify(xdr, bitmap, &fattr->mtime); - if (status < 0) + if ((status = decode_attr_time_modify(xdr, bitmap, &fattr->mtime)) != 0) goto xdr_error; - fattr->valid |= status; - - status = decode_attr_mounted_on_fileid(xdr, bitmap, &fileid); - if (status < 0) + if ((status = decode_attr_mounted_on_fileid(xdr, bitmap, &fileid)) != 0) goto xdr_error; - if (status != 0 && !(fattr->valid & status)) { + if (fattr->fileid == 0 && fileid != 0) fattr->fileid = fileid; - fattr->valid |= status; - } - - status = verify_attr_len(xdr, savep, attrlen); + if ((status = verify_attr_len(xdr, savep, attrlen)) == 0) + fattr->valid = NFS_ATTR_FATTR | NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4; xdr_error: dprintk("%s: xdr returned %d\n", __func__, -status); return status; @@ -4163,7 +4078,9 @@ static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_se status = decode_setattr(&xdr, res); if (status) goto out; - decode_getfattr(&xdr, res->fattr, res->server); + status = decode_getfattr(&xdr, res->fattr, res->server); + if (status == NFS4ERR_DELAY) + status = 0; out: return status; } diff --git a/trunk/fs/nfs/pagelist.c b/trunk/fs/nfs/pagelist.c index e2975939126a..7f079209d70a 100644 --- a/trunk/fs/nfs/pagelist.c +++ b/trunk/fs/nfs/pagelist.c @@ -176,6 +176,17 @@ void nfs_release_request(struct nfs_page *req) kref_put(&req->wb_kref, nfs_free_request); } +static int nfs_wait_bit_killable(void *word) +{ + int ret = 0; + + if (fatal_signal_pending(current)) + ret = -ERESTARTSYS; + else + schedule(); + return ret; +} + /** * nfs_wait_on_request - Wait for a request to complete. * @req: request to wait upon. 
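The nfs_wait_bit_killable() helper added above is a standard wait-bit action function: it sleeps until the watched bit clears, but aborts with -ERESTARTSYS when a fatal signal is pending. As a hedged sketch of how such an action function is wired up in this kernel era (the flags word, bit number, and "example_" names are assumptions; only the wait_on_bit() action signature comes from <linux/wait.h>):

/*
 * Illustrative only: a killable wait-bit action function passed to
 * wait_on_bit().  Assumes <linux/wait.h> and <linux/sched.h>; the
 * "example_" identifiers are hypothetical.
 */
static int example_wait_bit_killable(void *word)
{
	if (fatal_signal_pending(current))
		return -ERESTARTSYS;	/* bail out on a fatal signal */
	schedule();			/* otherwise sleep until woken */
	return 0;
}

static int example_wait_for_bit(unsigned long *flags)
{
	/* Sleeps in TASK_KILLABLE until bit 0 of *flags is cleared,
	 * or returns -ERESTARTSYS if a fatal signal arrives. */
	return wait_on_bit(flags, 0, example_wait_bit_killable,
			   TASK_KILLABLE);
}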
diff --git a/trunk/fs/nfs/proc.c b/trunk/fs/nfs/proc.c index 7be72d90d49d..193465210d7c 100644 --- a/trunk/fs/nfs/proc.c +++ b/trunk/fs/nfs/proc.c @@ -663,5 +663,4 @@ const struct nfs_rpc_ops nfs_v2_clientops = { .commit_setup = nfs_proc_commit_setup, .lock = nfs_proc_lock, .lock_check_bounds = nfs_lock_check_bounds, - .close_context = nfs_close_context, }; diff --git a/trunk/fs/nfs/super.c b/trunk/fs/nfs/super.c index 0942fcbbad3c..d6686f4786dc 100644 --- a/trunk/fs/nfs/super.c +++ b/trunk/fs/nfs/super.c @@ -1018,7 +1018,6 @@ static int nfs_parse_mount_options(char *raw, case Opt_rdma: mnt->flags |= NFS_MOUNT_TCP; /* for side protocols */ mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA; - xprt_load_transport(p); break; case Opt_acl: mnt->flags &= ~NFS_MOUNT_NOACL; @@ -1206,14 +1205,12 @@ static int nfs_parse_mount_options(char *raw, /* vector side protocols to TCP */ mnt->flags |= NFS_MOUNT_TCP; mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA; - xprt_load_transport(string); break; default: errors++; dfprintk(MOUNT, "NFS: unrecognized " "transport protocol\n"); } - kfree(string); break; case Opt_mountproto: string = match_strdup(args); @@ -1221,6 +1218,7 @@ static int nfs_parse_mount_options(char *raw, goto out_nomem; token = match_token(string, nfs_xprt_protocol_tokens, args); + kfree(string); switch (token) { case Opt_xprt_udp: diff --git a/trunk/fs/nfs/write.c b/trunk/fs/nfs/write.c index e560a78995a3..9f9845859fc1 100644 --- a/trunk/fs/nfs/write.c +++ b/trunk/fs/nfs/write.c @@ -313,34 +313,19 @@ static int nfs_writepages_callback(struct page *page, struct writeback_control * int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct inode *inode = mapping->host; - unsigned long *bitlock = &NFS_I(inode)->flags; struct nfs_pageio_descriptor pgio; int err; - /* Stop dirtying of new pages while we sync */ - err = wait_on_bit_lock(bitlock, NFS_INO_FLUSHING, - nfs_wait_bit_killable, TASK_KILLABLE); - if (err) - goto out_err; - nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); nfs_pageio_init_write(&pgio, inode, wb_priority(wbc)); err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio); nfs_pageio_complete(&pgio); - - clear_bit_unlock(NFS_INO_FLUSHING, bitlock); - smp_mb__after_clear_bit(); - wake_up_bit(bitlock, NFS_INO_FLUSHING); - if (err < 0) - goto out_err; - err = pgio.pg_error; - if (err < 0) - goto out_err; + return err; + if (pgio.pg_error < 0) + return pgio.pg_error; return 0; -out_err: - return err; } /* @@ -419,6 +404,7 @@ nfs_mark_request_commit(struct nfs_page *req) struct nfs_inode *nfsi = NFS_I(inode); spin_lock(&inode->i_lock); + nfsi->ncommit++; set_bit(PG_CLEAN, &(req)->wb_flags); radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, @@ -538,12 +524,6 @@ static void nfs_cancel_commit_list(struct list_head *head) } #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) -static int -nfs_need_commit(struct nfs_inode *nfsi) -{ - return radix_tree_tagged(&nfsi->nfs_page_tree, NFS_PAGE_TAG_COMMIT); -} - /* * nfs_scan_commit - Scan an inode for commit requests * @inode: NFS inode to scan @@ -558,18 +538,16 @@ static int nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages) { struct nfs_inode *nfsi = NFS_I(inode); + int res = 0; - if (!nfs_need_commit(nfsi)) - return 0; - - return nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT); + if (nfsi->ncommit != 0) { + res = nfs_scan_list(nfsi, dst, idx_start, npages, + NFS_PAGE_TAG_COMMIT); + nfsi->ncommit -= res; + } + return res; } #else 
-static inline int nfs_need_commit(struct nfs_inode *nfsi) -{ - return 0; -} - static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages) { return 0; @@ -842,7 +820,7 @@ static int nfs_write_rpcsetup(struct nfs_page *req, data->args.stable = NFS_UNSTABLE; if (how & FLUSH_STABLE) { data->args.stable = NFS_DATA_SYNC; - if (!nfs_need_commit(NFS_I(inode))) + if (!NFS_I(inode)->ncommit) data->args.stable = NFS_FILE_SYNC; } @@ -1447,13 +1425,18 @@ static int nfs_write_mapping(struct address_space *mapping, int how) { struct writeback_control wbc = { .bdi = mapping->backing_dev_info, - .sync_mode = WB_SYNC_ALL, + .sync_mode = WB_SYNC_NONE, .nr_to_write = LONG_MAX, .range_start = 0, .range_end = LLONG_MAX, .for_writepages = 1, }; + int ret; + ret = __nfs_write_mapping(mapping, &wbc, how); + if (ret < 0) + return ret; + wbc.sync_mode = WB_SYNC_ALL; return __nfs_write_mapping(mapping, &wbc, how); } diff --git a/trunk/fs/nfsd/nfsctl.c b/trunk/fs/nfsd/nfsctl.c index a4ed8644d69c..3d93b2064ce5 100644 --- a/trunk/fs/nfsd/nfsctl.c +++ b/trunk/fs/nfsd/nfsctl.c @@ -938,12 +938,10 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size) char transport[16]; int port; if (sscanf(buf, "%15s %4d", transport, &port) == 2) { - if (port < 1 || port > 65535) - return -EINVAL; err = nfsd_create_serv(); if (!err) { err = svc_create_xprt(nfsd_serv, - transport, PF_INET, port, + transport, port, SVC_SOCK_ANONYMOUS); if (err == -ENOENT) /* Give a reasonable perror msg for @@ -962,7 +960,7 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size) char transport[16]; int port; if (sscanf(&buf[1], "%15s %4d", transport, &port) == 2) { - if (port < 1 || port > 65535) + if (port == 0) return -EINVAL; if (nfsd_serv) { xprt = svc_find_xprt(nfsd_serv, transport, diff --git a/trunk/fs/nfsd/nfssvc.c b/trunk/fs/nfsd/nfssvc.c index bc3567bab8c4..07e4f5d7baa8 100644 --- a/trunk/fs/nfsd/nfssvc.c +++ b/trunk/fs/nfsd/nfssvc.c @@ -229,6 +229,7 @@ int nfsd_create_serv(void) atomic_set(&nfsd_busy, 0); nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, + AF_INET, nfsd_last_thread, nfsd, THIS_MODULE); if (nfsd_serv == NULL) err = -ENOMEM; @@ -243,7 +244,7 @@ static int nfsd_init_socks(int port) if (!list_empty(&nfsd_serv->sv_permsocks)) return 0; - error = svc_create_xprt(nfsd_serv, "udp", PF_INET, port, + error = svc_create_xprt(nfsd_serv, "udp", port, SVC_SOCK_DEFAULTS); if (error < 0) return error; @@ -252,7 +253,7 @@ static int nfsd_init_socks(int port) if (error < 0) return error; - error = svc_create_xprt(nfsd_serv, "tcp", PF_INET, port, + error = svc_create_xprt(nfsd_serv, "tcp", port, SVC_SOCK_DEFAULTS); if (error < 0) return error; diff --git a/trunk/include/drm/drm_crtc_helper.h b/trunk/include/drm/drm_crtc_helper.h index ec073d8288d9..c7d4b2e606a5 100644 --- a/trunk/include/drm/drm_crtc_helper.h +++ b/trunk/include/drm/drm_crtc_helper.h @@ -33,6 +33,7 @@ #ifndef __DRM_CRTC_HELPER_H__ #define __DRM_CRTC_HELPER_H__ +#include #include #include #include @@ -91,7 +92,7 @@ struct drm_connector_helper_funcs { extern int drm_helper_probe_single_connector_modes(struct drm_connector *connector, uint32_t maxX, uint32_t maxY); extern void drm_helper_disable_unused_functions(struct drm_device *dev); extern int drm_helper_hotplug_stage_two(struct drm_device *dev); -extern bool drm_helper_initial_config(struct drm_device *dev); +extern bool drm_helper_initial_config(struct drm_device *dev, bool can_grow); extern int 
drm_crtc_helper_set_config(struct drm_mode_set *set); extern bool drm_crtc_helper_set_mode(struct drm_crtc *crtc, struct drm_display_mode *mode, diff --git a/trunk/include/drm/drm_os_linux.h b/trunk/include/drm/drm_os_linux.h index 26641e95e0a4..013551d03c03 100644 --- a/trunk/include/drm/drm_os_linux.h +++ b/trunk/include/drm/drm_os_linux.h @@ -7,12 +7,12 @@ #include #ifndef readq -static inline u64 readq(void __iomem *reg) +static u64 readq(void __iomem *reg) { return ((u64) readl(reg)) | (((u64) readl(reg + 4UL)) << 32); } -static inline void writeq(u64 val, void __iomem *reg) +static void writeq(u64 val, void __iomem *reg) { writel(val & 0xffffffff, reg); writel(val >> 32, reg + 0x4UL); diff --git a/trunk/include/linux/jbd2.h b/trunk/include/linux/jbd2.h index 8815a3456b3b..4d248b3f1323 100644 --- a/trunk/include/linux/jbd2.h +++ b/trunk/include/linux/jbd2.h @@ -648,12 +648,6 @@ struct transaction_s */ int t_handle_count; - /* - * This transaction is being forced and some process is - * waiting for it to finish. - */ - int t_synchronous_commit:1; - /* * For use by the filesystem to store fs-specific data * structures associated with the transaction diff --git a/trunk/include/linux/nfs_fs.h b/trunk/include/linux/nfs_fs.h index bde2557c2a9c..8cc8807f77d6 100644 --- a/trunk/include/linux/nfs_fs.h +++ b/trunk/include/linux/nfs_fs.h @@ -166,7 +166,8 @@ struct nfs_inode { */ struct radix_tree_root nfs_page_tree; - unsigned long npages; + unsigned long ncommit, + npages; /* Open contexts for shared mmap writes */ struct list_head open_files; @@ -206,7 +207,6 @@ struct nfs_inode { #define NFS_INO_STALE (1) /* possible stale inode */ #define NFS_INO_ACL_LRU_SET (2) /* Inode is on the LRU list */ #define NFS_INO_MOUNTPOINT (3) /* inode is remote mountpoint */ -#define NFS_INO_FLUSHING (4) /* inode is flushing out data */ static inline struct nfs_inode *NFS_I(const struct inode *inode) { diff --git a/trunk/include/linux/nfs_fs_sb.h b/trunk/include/linux/nfs_fs_sb.h index 29b1e40dce99..9bb81aec91cf 100644 --- a/trunk/include/linux/nfs_fs_sb.h +++ b/trunk/include/linux/nfs_fs_sb.h @@ -106,11 +106,6 @@ struct nfs_server { u32 attr_bitmask[2];/* V4 bitmask representing the set of attributes supported on this filesystem */ - u32 cache_consistency_bitmask[2]; - /* V4 bitmask representing the subset - of change attribute, size, ctime - and mtime attributes supported by - the server */ u32 acl_bitmask; /* V4 bitmask representing the ACEs that are supported on this filesystem */ diff --git a/trunk/include/linux/nfs_xdr.h b/trunk/include/linux/nfs_xdr.h index b89c34e40bc2..43a713fce11c 100644 --- a/trunk/include/linux/nfs_xdr.h +++ b/trunk/include/linux/nfs_xdr.h @@ -27,8 +27,12 @@ static inline int nfs_fsid_equal(const struct nfs_fsid *a, const struct nfs_fsid } struct nfs_fattr { - unsigned int valid; /* which fields are valid */ - umode_t mode; + unsigned short valid; /* which fields are valid */ + __u64 pre_size; /* pre_op_attr.size */ + struct timespec pre_mtime; /* pre_op_attr.mtime */ + struct timespec pre_ctime; /* pre_op_attr.ctime */ + enum nfs_ftype type; /* always use NFSv2 types */ + __u32 mode; __u32 nlink; __u32 uid; __u32 gid; @@ -48,55 +52,19 @@ struct nfs_fattr { struct timespec atime; struct timespec mtime; struct timespec ctime; + __u32 bitmap[2]; /* NFSv4 returned attribute bitmap */ __u64 change_attr; /* NFSv4 change attribute */ __u64 pre_change_attr;/* pre-op NFSv4 change attribute */ - __u64 pre_size; /* pre_op_attr.size */ - struct timespec pre_mtime; /* pre_op_attr.mtime */ - 
struct timespec pre_ctime; /* pre_op_attr.ctime */ unsigned long time_start; unsigned long gencount; }; -#define NFS_ATTR_FATTR_TYPE (1U << 0) -#define NFS_ATTR_FATTR_MODE (1U << 1) -#define NFS_ATTR_FATTR_NLINK (1U << 2) -#define NFS_ATTR_FATTR_OWNER (1U << 3) -#define NFS_ATTR_FATTR_GROUP (1U << 4) -#define NFS_ATTR_FATTR_RDEV (1U << 5) -#define NFS_ATTR_FATTR_SIZE (1U << 6) -#define NFS_ATTR_FATTR_PRESIZE (1U << 7) -#define NFS_ATTR_FATTR_BLOCKS_USED (1U << 8) -#define NFS_ATTR_FATTR_SPACE_USED (1U << 9) -#define NFS_ATTR_FATTR_FSID (1U << 10) -#define NFS_ATTR_FATTR_FILEID (1U << 11) -#define NFS_ATTR_FATTR_ATIME (1U << 12) -#define NFS_ATTR_FATTR_MTIME (1U << 13) -#define NFS_ATTR_FATTR_CTIME (1U << 14) -#define NFS_ATTR_FATTR_PREMTIME (1U << 15) -#define NFS_ATTR_FATTR_PRECTIME (1U << 16) -#define NFS_ATTR_FATTR_CHANGE (1U << 17) -#define NFS_ATTR_FATTR_PRECHANGE (1U << 18) -#define NFS_ATTR_FATTR_V4_REFERRAL (1U << 19) /* NFSv4 referral */ - -#define NFS_ATTR_FATTR (NFS_ATTR_FATTR_TYPE \ - | NFS_ATTR_FATTR_MODE \ - | NFS_ATTR_FATTR_NLINK \ - | NFS_ATTR_FATTR_OWNER \ - | NFS_ATTR_FATTR_GROUP \ - | NFS_ATTR_FATTR_RDEV \ - | NFS_ATTR_FATTR_SIZE \ - | NFS_ATTR_FATTR_FSID \ - | NFS_ATTR_FATTR_FILEID \ - | NFS_ATTR_FATTR_ATIME \ - | NFS_ATTR_FATTR_MTIME \ - | NFS_ATTR_FATTR_CTIME) -#define NFS_ATTR_FATTR_V2 (NFS_ATTR_FATTR \ - | NFS_ATTR_FATTR_BLOCKS_USED) -#define NFS_ATTR_FATTR_V3 (NFS_ATTR_FATTR \ - | NFS_ATTR_FATTR_SPACE_USED) -#define NFS_ATTR_FATTR_V4 (NFS_ATTR_FATTR \ - | NFS_ATTR_FATTR_SPACE_USED \ - | NFS_ATTR_FATTR_CHANGE) +#define NFS_ATTR_WCC 0x0001 /* pre-op WCC data */ +#define NFS_ATTR_FATTR 0x0002 /* post-op attributes */ +#define NFS_ATTR_FATTR_V3 0x0004 /* NFSv3 attributes */ +#define NFS_ATTR_FATTR_V4 0x0008 /* NFSv4 change attribute */ +#define NFS_ATTR_WCC_V4 0x0010 /* pre-op change attribute */ +#define NFS_ATTR_FATTR_V4_REFERRAL 0x0020 /* NFSv4 referral */ /* * Info on the file system @@ -868,7 +836,6 @@ struct nfs_rpc_ops { int (*lock)(struct file *, int, struct file_lock *); int (*lock_check_bounds)(const struct file_lock *); void (*clear_acl_cache)(struct inode *); - void (*close_context)(struct nfs_open_context *ctx, int); }; /* diff --git a/trunk/include/linux/sunrpc/svc.h b/trunk/include/linux/sunrpc/svc.h index d3a4c0231933..3435d24bfe55 100644 --- a/trunk/include/linux/sunrpc/svc.h +++ b/trunk/include/linux/sunrpc/svc.h @@ -69,6 +69,7 @@ struct svc_serv { struct list_head sv_tempsocks; /* all temporary sockets */ int sv_tmpcnt; /* count of temporary sockets */ struct timer_list sv_temptimer; /* timer for aging temporary sockets */ + sa_family_t sv_family; /* listener's address family */ char * sv_name; /* service name */ @@ -384,19 +385,19 @@ struct svc_procedure { /* * Function prototypes. 
*/ -struct svc_serv *svc_create(struct svc_program *, unsigned int, +struct svc_serv *svc_create(struct svc_program *, unsigned int, sa_family_t, void (*shutdown)(struct svc_serv *)); struct svc_rqst *svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool); void svc_exit_thread(struct svc_rqst *); struct svc_serv * svc_create_pooled(struct svc_program *, unsigned int, - void (*shutdown)(struct svc_serv *), + sa_family_t, void (*shutdown)(struct svc_serv *), svc_thread_fn, struct module *); int svc_set_num_threads(struct svc_serv *, struct svc_pool *, int); void svc_destroy(struct svc_serv *); int svc_process(struct svc_rqst *); -int svc_register(const struct svc_serv *, const int, - const unsigned short, const unsigned short); +int svc_register(const struct svc_serv *, const unsigned short, + const unsigned short); void svc_wake_up(struct svc_serv *); void svc_reserve(struct svc_rqst *rqstp, int space); diff --git a/trunk/include/linux/sunrpc/svc_xprt.h b/trunk/include/linux/sunrpc/svc_xprt.h index 0d9cb6ef28b0..0127daca4354 100644 --- a/trunk/include/linux/sunrpc/svc_xprt.h +++ b/trunk/include/linux/sunrpc/svc_xprt.h @@ -71,8 +71,7 @@ int svc_reg_xprt_class(struct svc_xprt_class *); void svc_unreg_xprt_class(struct svc_xprt_class *); void svc_xprt_init(struct svc_xprt_class *, struct svc_xprt *, struct svc_serv *); -int svc_create_xprt(struct svc_serv *, const char *, const int, - const unsigned short, int); +int svc_create_xprt(struct svc_serv *, char *, unsigned short, int); void svc_xprt_enqueue(struct svc_xprt *xprt); void svc_xprt_received(struct svc_xprt *); void svc_xprt_put(struct svc_xprt *xprt); @@ -81,8 +80,7 @@ void svc_close_xprt(struct svc_xprt *xprt); void svc_delete_xprt(struct svc_xprt *xprt); int svc_port_is_privileged(struct sockaddr *sin); int svc_print_xprts(char *buf, int maxlen); -struct svc_xprt *svc_find_xprt(struct svc_serv *serv, const char *xcl_name, - const sa_family_t af, const unsigned short port); +struct svc_xprt *svc_find_xprt(struct svc_serv *, char *, int, int); int svc_xprt_names(struct svc_serv *serv, char *buf, int buflen); static inline void svc_xprt_get(struct svc_xprt *xprt) @@ -90,32 +88,29 @@ static inline void svc_xprt_get(struct svc_xprt *xprt) kref_get(&xprt->xpt_ref); } static inline void svc_xprt_set_local(struct svc_xprt *xprt, - const struct sockaddr *sa, - const size_t salen) + struct sockaddr *sa, int salen) { memcpy(&xprt->xpt_local, sa, salen); xprt->xpt_locallen = salen; } static inline void svc_xprt_set_remote(struct svc_xprt *xprt, - const struct sockaddr *sa, - const size_t salen) + struct sockaddr *sa, int salen) { memcpy(&xprt->xpt_remote, sa, salen); xprt->xpt_remotelen = salen; } -static inline unsigned short svc_addr_port(const struct sockaddr *sa) +static inline unsigned short svc_addr_port(struct sockaddr *sa) { - const struct sockaddr_in *sin = (const struct sockaddr_in *)sa; - const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)sa; - + unsigned short ret = 0; switch (sa->sa_family) { case AF_INET: - return ntohs(sin->sin_port); + ret = ntohs(((struct sockaddr_in *)sa)->sin_port); + break; case AF_INET6: - return ntohs(sin6->sin6_port); + ret = ntohs(((struct sockaddr_in6 *)sa)->sin6_port); + break; } - - return 0; + return ret; } static inline size_t svc_addr_len(struct sockaddr *sa) @@ -129,39 +124,36 @@ static inline size_t svc_addr_len(struct sockaddr *sa) return -EAFNOSUPPORT; } -static inline unsigned short svc_xprt_local_port(const struct svc_xprt *xprt) +static inline unsigned short 
svc_xprt_local_port(struct svc_xprt *xprt) { - return svc_addr_port((const struct sockaddr *)&xprt->xpt_local); + return svc_addr_port((struct sockaddr *)&xprt->xpt_local); } -static inline unsigned short svc_xprt_remote_port(const struct svc_xprt *xprt) +static inline unsigned short svc_xprt_remote_port(struct svc_xprt *xprt) { - return svc_addr_port((const struct sockaddr *)&xprt->xpt_remote); + return svc_addr_port((struct sockaddr *)&xprt->xpt_remote); } -static inline char *__svc_print_addr(const struct sockaddr *addr, - char *buf, const size_t len) +static inline char *__svc_print_addr(struct sockaddr *addr, + char *buf, size_t len) { - const struct sockaddr_in *sin = (const struct sockaddr_in *)addr; - const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)addr; - switch (addr->sa_family) { case AF_INET: - snprintf(buf, len, "%pI4, port=%u", &sin->sin_addr, - ntohs(sin->sin_port)); + snprintf(buf, len, "%pI4, port=%u", + &((struct sockaddr_in *)addr)->sin_addr, + ntohs(((struct sockaddr_in *) addr)->sin_port)); break; case AF_INET6: snprintf(buf, len, "%pI6, port=%u", - &sin6->sin6_addr, - ntohs(sin6->sin6_port)); + &((struct sockaddr_in6 *)addr)->sin6_addr, + ntohs(((struct sockaddr_in6 *) addr)->sin6_port)); break; default: snprintf(buf, len, "unknown address type: %d", addr->sa_family); break; } - return buf; } #endif /* SUNRPC_SVC_XPRT_H */ diff --git a/trunk/include/linux/sunrpc/xprt.h b/trunk/include/linux/sunrpc/xprt.h index 1758d9f5b5c3..11fc71d50c1e 100644 --- a/trunk/include/linux/sunrpc/xprt.h +++ b/trunk/include/linux/sunrpc/xprt.h @@ -235,7 +235,6 @@ static inline __be32 *xprt_skip_transport_header(struct rpc_xprt *xprt, __be32 * */ int xprt_register_transport(struct xprt_class *type); int xprt_unregister_transport(struct xprt_class *type); -int xprt_load_transport(const char *); void xprt_set_retrans_timeout_def(struct rpc_task *task); void xprt_set_retrans_timeout_rtt(struct rpc_task *task); void xprt_wake_pending_tasks(struct rpc_xprt *xprt, int status); @@ -260,7 +259,6 @@ void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie); #define XPRT_BOUND (4) #define XPRT_BINDING (5) #define XPRT_CLOSING (6) -#define XPRT_CONNECTION_ABORT (7) static inline void xprt_set_connected(struct rpc_xprt *xprt) { diff --git a/trunk/net/sunrpc/Kconfig b/trunk/net/sunrpc/Kconfig index afd91c78ce8e..5592883e1e4a 100644 --- a/trunk/net/sunrpc/Kconfig +++ b/trunk/net/sunrpc/Kconfig @@ -17,6 +17,28 @@ config SUNRPC_XPRT_RDMA If unsure, say N. +config SUNRPC_REGISTER_V4 + bool "Register local RPC services via rpcbind v4 (EXPERIMENTAL)" + depends on SUNRPC && EXPERIMENTAL + default n + help + Sun added support for registering RPC services at an IPv6 + address by creating two new versions of the rpcbind protocol + (RFC 1833). + + This option enables support in the kernel RPC server for + registering kernel RPC services via version 4 of the rpcbind + protocol. If you enable this option, you must run a portmapper + daemon that supports rpcbind protocol version 4. + + Serving NFS over IPv6 from knfsd (the kernel's NFS server) + requires that you enable this option and use a portmapper that + supports rpcbind version 4. + + If unsure, say N to get traditional behavior (register kernel + RPC services using only rpcbind version 2). Distributions + using the legacy Linux portmapper daemon must say N here. 
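The Kconfig help above is mirrored later in this patch, where net/sunrpc/svc.c compiles one of two registration paths depending on the option. A minimal sketch of that pattern, with variable names assumed (the actual hunks appear further below):

/*
 * Sketch: CONFIG_SUNRPC_REGISTER_V4 selects between the rpcbind v4
 * SET path and the legacy v2 call, matching the svc.c changes below.
 */
#ifdef CONFIG_SUNRPC_REGISTER_V4
	error = rpcb_v4_register(program, version, sap, netid);
#else
	error = rpcb_register(program, version, protocol, port);
#endif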
+ config RPCSEC_GSS_KRB5 tristate "Secure RPC: Kerberos V mechanism (EXPERIMENTAL)" depends on SUNRPC && EXPERIMENTAL diff --git a/trunk/net/sunrpc/clnt.c b/trunk/net/sunrpc/clnt.c index 5abab094441f..836f15c0c4a3 100644 --- a/trunk/net/sunrpc/clnt.c +++ b/trunk/net/sunrpc/clnt.c @@ -1032,20 +1032,27 @@ call_connect_status(struct rpc_task *task) dprint_status(task); task->tk_status = 0; - if (status >= 0 || status == -EAGAIN) { + if (status >= 0) { clnt->cl_stats->netreconn++; task->tk_action = call_transmit; return; } + /* Something failed: remote service port may have changed */ + rpc_force_rebind(clnt); + switch (status) { + case -ENOTCONN: + case -EAGAIN: + task->tk_action = call_bind; + if (!RPC_IS_SOFT(task)) + return; /* if soft mounted, test if we've timed out */ case -ETIMEDOUT: task->tk_action = call_timeout; - break; - default: - rpc_exit(task, -EIO); + return; } + rpc_exit(task, -EIO); } /* @@ -1098,26 +1105,14 @@ static void call_transmit_status(struct rpc_task *task) { task->tk_action = call_status; - switch (task->tk_status) { - case -EAGAIN: - break; - default: - xprt_end_transmit(task); - /* - * Special cases: if we've been waiting on the - * socket's write_space() callback, or if the - * socket just returned a connection error, - * then hold onto the transport lock. - */ - case -ECONNREFUSED: - case -ECONNRESET: - case -ENOTCONN: - case -EHOSTDOWN: - case -EHOSTUNREACH: - case -ENETUNREACH: - case -EPIPE: - rpc_task_force_reencode(task); - } + /* + * Special case: if we've been waiting on the socket's write_space() + * callback, then don't call xprt_end_transmit(). + */ + if (task->tk_status == -EAGAIN) + return; + xprt_end_transmit(task); + rpc_task_force_reencode(task); } /* @@ -1157,12 +1152,9 @@ call_status(struct rpc_task *task) xprt_conditional_disconnect(task->tk_xprt, req->rq_connect_cookie); break; - case -ECONNRESET: case -ECONNREFUSED: - rpc_force_rebind(clnt); - rpc_delay(task, 3*HZ); - case -EPIPE: case -ENOTCONN: + rpc_force_rebind(clnt); task->tk_action = call_bind; break; case -EAGAIN: diff --git a/trunk/net/sunrpc/rpcb_clnt.c b/trunk/net/sunrpc/rpcb_clnt.c index beee6da33035..03ae007641e4 100644 --- a/trunk/net/sunrpc/rpcb_clnt.c +++ b/trunk/net/sunrpc/rpcb_clnt.c @@ -63,16 +63,9 @@ enum { * r_owner * * The "owner" is allowed to unset a service in the rpcbind database. - * - * For AF_LOCAL SET/UNSET requests, rpcbind treats this string as a - * UID which it maps to a local user name via a password lookup. - * In all other cases it is ignored. - * - * For SET/UNSET requests, user space provides a value, even for - * network requests, and GETADDR uses an empty string. We follow - * those precedents here. + * We always use the following (arbitrary) fixed string. 
*/ -#define RPCB_OWNER_STRING "0" +#define RPCB_OWNER_STRING "rpcb" #define RPCB_MAXOWNERLEN sizeof(RPCB_OWNER_STRING) static void rpcb_getport_done(struct rpc_task *, void *); @@ -131,6 +124,12 @@ static const struct sockaddr_in rpcb_inaddr_loopback = { .sin_port = htons(RPCBIND_PORT), }; +static const struct sockaddr_in6 rpcb_in6addr_loopback = { + .sin6_family = AF_INET6, + .sin6_addr = IN6ADDR_LOOPBACK_INIT, + .sin6_port = htons(RPCBIND_PORT), +}; + static struct rpc_clnt *rpcb_create_local(struct sockaddr *addr, size_t addrlen, u32 version) { @@ -177,10 +176,9 @@ static struct rpc_clnt *rpcb_create(char *hostname, struct sockaddr *srvaddr, return rpc_create(&args); } -static int rpcb_register_call(const u32 version, struct rpc_message *msg) +static int rpcb_register_call(struct sockaddr *addr, size_t addrlen, + u32 version, struct rpc_message *msg) { - struct sockaddr *addr = (struct sockaddr *)&rpcb_inaddr_loopback; - size_t addrlen = sizeof(rpcb_inaddr_loopback); struct rpc_clnt *rpcb_clnt; int result, error = 0; @@ -194,7 +192,7 @@ static int rpcb_register_call(const u32 version, struct rpc_message *msg) error = PTR_ERR(rpcb_clnt); if (error < 0) { - dprintk("RPC: failed to contact local rpcbind " + printk(KERN_WARNING "RPC: failed to contact local rpcbind " "server (errno %d).\n", -error); return error; } @@ -256,23 +254,25 @@ int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port) if (port) msg.rpc_proc = &rpcb_procedures2[RPCBPROC_SET]; - return rpcb_register_call(RPCBVERS_2, &msg); + return rpcb_register_call((struct sockaddr *)&rpcb_inaddr_loopback, + sizeof(rpcb_inaddr_loopback), + RPCBVERS_2, &msg); } /* * Fill in AF_INET family-specific arguments to register */ -static int rpcb_register_inet4(const struct sockaddr *sap, - struct rpc_message *msg) +static int rpcb_register_netid4(struct sockaddr_in *address_to_register, + struct rpc_message *msg) { - const struct sockaddr_in *sin = (const struct sockaddr_in *)sap; struct rpcbind_args *map = msg->rpc_argp; - unsigned short port = ntohs(sin->sin_port); + unsigned short port = ntohs(address_to_register->sin_port); char buf[32]; /* Construct AF_INET universal address */ snprintf(buf, sizeof(buf), "%pI4.%u.%u", - &sin->sin_addr.s_addr, port >> 8, port & 0xff); + &address_to_register->sin_addr.s_addr, + port >> 8, port & 0xff); map->r_addr = buf; dprintk("RPC: %sregistering [%u, %u, %s, '%s'] with " @@ -284,27 +284,29 @@ static int rpcb_register_inet4(const struct sockaddr *sap, if (port) msg->rpc_proc = &rpcb_procedures4[RPCBPROC_SET]; - return rpcb_register_call(RPCBVERS_4, msg); + return rpcb_register_call((struct sockaddr *)&rpcb_inaddr_loopback, + sizeof(rpcb_inaddr_loopback), + RPCBVERS_4, msg); } /* * Fill in AF_INET6 family-specific arguments to register */ -static int rpcb_register_inet6(const struct sockaddr *sap, - struct rpc_message *msg) +static int rpcb_register_netid6(struct sockaddr_in6 *address_to_register, + struct rpc_message *msg) { - const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)sap; struct rpcbind_args *map = msg->rpc_argp; - unsigned short port = ntohs(sin6->sin6_port); + unsigned short port = ntohs(address_to_register->sin6_port); char buf[64]; /* Construct AF_INET6 universal address */ - if (ipv6_addr_any(&sin6->sin6_addr)) + if (ipv6_addr_any(&address_to_register->sin6_addr)) snprintf(buf, sizeof(buf), "::.%u.%u", port >> 8, port & 0xff); else snprintf(buf, sizeof(buf), "%pI6.%u.%u", - &sin6->sin6_addr, port >> 8, port & 0xff); + &address_to_register->sin6_addr, + port >> 8, 
port & 0xff); map->r_addr = buf; dprintk("RPC: %sregistering [%u, %u, %s, '%s'] with " @@ -316,21 +318,9 @@ static int rpcb_register_inet6(const struct sockaddr *sap, if (port) msg->rpc_proc = &rpcb_procedures4[RPCBPROC_SET]; - return rpcb_register_call(RPCBVERS_4, msg); -} - -static int rpcb_unregister_all_protofamilies(struct rpc_message *msg) -{ - struct rpcbind_args *map = msg->rpc_argp; - - dprintk("RPC: unregistering [%u, %u, '%s'] with " - "local rpcbind\n", - map->r_prog, map->r_vers, map->r_netid); - - map->r_addr = ""; - msg->rpc_proc = &rpcb_procedures4[RPCBPROC_UNSET]; - - return rpcb_register_call(RPCBVERS_4, msg); + return rpcb_register_call((struct sockaddr *)&rpcb_in6addr_loopback, + sizeof(rpcb_in6addr_loopback), + RPCBVERS_4, msg); } /** @@ -350,11 +340,10 @@ static int rpcb_unregister_all_protofamilies(struct rpc_message *msg) * invoke this function once for each [program, version, address, * netid] tuple they wish to advertise. * - * Callers may also unregister RPC services that are registered at a - * specific address by setting the port number in @address to zero. - * They may unregister all registered protocol families at once for - * a service by passing a NULL @address argument. If @netid is "" - * then all netids for [program, version, address] are unregistered. + * Callers may also unregister RPC services that are no longer + * available by setting the port number in the passed-in address + * to zero. Callers pass a netid of "" to unregister all + * transport netids associated with [program, version, address]. * * This function uses rpcbind protocol version 4 to contact the * local rpcbind daemon. The local rpcbind daemon must support @@ -389,14 +378,13 @@ int rpcb_v4_register(const u32 program, const u32 version, .rpc_argp = &map, }; - if (address == NULL) - return rpcb_unregister_all_protofamilies(&msg); - switch (address->sa_family) { case AF_INET: - return rpcb_register_inet4(address, &msg); + return rpcb_register_netid4((struct sockaddr_in *)address, + &msg); case AF_INET6: - return rpcb_register_inet6(address, &msg); + return rpcb_register_netid6((struct sockaddr_in6 *)address, + &msg); } return -EAFNOSUPPORT; @@ -591,7 +579,7 @@ void rpcb_getport_async(struct rpc_task *task) map->r_xprt = xprt_get(xprt); map->r_netid = rpc_peeraddr2str(clnt, RPC_DISPLAY_NETID); map->r_addr = rpc_peeraddr2str(rpcb_clnt, RPC_DISPLAY_UNIVERSAL_ADDR); - map->r_owner = ""; + map->r_owner = RPCB_OWNER_STRING; /* ignored for GETADDR */ map->r_status = -EIO; child = rpcb_call_async(rpcb_clnt, map, proc); @@ -715,16 +703,11 @@ static int rpcb_decode_getaddr(struct rpc_rqst *req, __be32 *p, *portp = 0; addr_len = ntohl(*p++); - if (addr_len == 0) { - dprintk("RPC: rpcb_decode_getaddr: " - "service is not registered\n"); - return 0; - } - /* - * Simple sanity check. + * Simple sanity check. The smallest possible universal + * address is an IPv4 address string containing 11 bytes. 
*/ - if (addr_len > RPCBIND_MAXUADDRLEN) + if (addr_len < 11 || addr_len > RPCBIND_MAXUADDRLEN) goto out_err; /* diff --git a/trunk/net/sunrpc/svc.c b/trunk/net/sunrpc/svc.c index 9f2f2412a2f3..bb507e2bb94d 100644 --- a/trunk/net/sunrpc/svc.c +++ b/trunk/net/sunrpc/svc.c @@ -359,7 +359,7 @@ svc_pool_for_cpu(struct svc_serv *serv, int cpu) */ static struct svc_serv * __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, - void (*shutdown)(struct svc_serv *serv)) + sa_family_t family, void (*shutdown)(struct svc_serv *serv)) { struct svc_serv *serv; unsigned int vers; @@ -368,6 +368,7 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, if (!(serv = kzalloc(sizeof(*serv), GFP_KERNEL))) return NULL; + serv->sv_family = family; serv->sv_name = prog->pg_name; serv->sv_program = prog; serv->sv_nrthreads = 1; @@ -426,21 +427,21 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, struct svc_serv * svc_create(struct svc_program *prog, unsigned int bufsize, - void (*shutdown)(struct svc_serv *serv)) + sa_family_t family, void (*shutdown)(struct svc_serv *serv)) { - return __svc_create(prog, bufsize, /*npools*/1, shutdown); + return __svc_create(prog, bufsize, /*npools*/1, family, shutdown); } EXPORT_SYMBOL_GPL(svc_create); struct svc_serv * svc_create_pooled(struct svc_program *prog, unsigned int bufsize, - void (*shutdown)(struct svc_serv *serv), + sa_family_t family, void (*shutdown)(struct svc_serv *serv), svc_thread_fn func, struct module *mod) { struct svc_serv *serv; unsigned int npools = svc_pool_map_get(); - serv = __svc_create(prog, bufsize, npools, shutdown); + serv = __svc_create(prog, bufsize, npools, family, shutdown); if (serv != NULL) { serv->sv_function = func; @@ -718,6 +719,8 @@ svc_exit_thread(struct svc_rqst *rqstp) } EXPORT_SYMBOL_GPL(svc_exit_thread); +#ifdef CONFIG_SUNRPC_REGISTER_V4 + /* * Register an "inet" protocol family netid with the local * rpcbind daemon via an rpcbind v4 SET request. @@ -732,13 +735,12 @@ static int __svc_rpcb_register4(const u32 program, const u32 version, const unsigned short protocol, const unsigned short port) { - const struct sockaddr_in sin = { + struct sockaddr_in sin = { .sin_family = AF_INET, .sin_addr.s_addr = htonl(INADDR_ANY), .sin_port = htons(port), }; - const char *netid; - int error; + char *netid; switch (protocol) { case IPPROTO_UDP: @@ -748,23 +750,13 @@ static int __svc_rpcb_register4(const u32 program, const u32 version, netid = RPCBIND_NETID_TCP; break; default: - return -ENOPROTOOPT; + return -EPROTONOSUPPORT; } - error = rpcb_v4_register(program, version, - (const struct sockaddr *)&sin, netid); - - /* - * User space didn't support rpcbind v4, so retry this - * registration request with the legacy rpcbind v2 protocol. - */ - if (error == -EPROTONOSUPPORT) - error = rpcb_register(program, version, protocol, port); - - return error; + return rpcb_v4_register(program, version, + (struct sockaddr *)&sin, netid); } -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) /* * Register an "inet6" protocol family netid with the local * rpcbind daemon via an rpcbind v4 SET request. 
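The rpcb_register_netid4()/rpcb_register_netid6() helpers above encode the endpoint as an RFC 1833 universal address, appending the port as two dotted decimal octets. A self-contained sketch of the IPv4 case (the example_ name is hypothetical; %pI4 is the kernel's printk format extension for IPv4 addresses):

/*
 * Sketch: 192.0.2.5 port 2049 encodes as "192.0.2.5.8.1",
 * since 2049 == (8 << 8) | 1.
 */
static void example_uaddr4(char *buf, size_t len,
			   const struct sockaddr_in *sin)
{
	unsigned short port = ntohs(sin->sin_port);

	snprintf(buf, len, "%pI4.%u.%u",
		 &sin->sin_addr.s_addr, port >> 8, port & 0xff);
}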
@@ -779,13 +771,12 @@ static int __svc_rpcb_register6(const u32 program, const u32 version, const unsigned short protocol, const unsigned short port) { - const struct sockaddr_in6 sin6 = { + struct sockaddr_in6 sin6 = { .sin6_family = AF_INET6, .sin6_addr = IN6ADDR_ANY_INIT, .sin6_port = htons(port), }; - const char *netid; - int error; + char *netid; switch (protocol) { case IPPROTO_UDP: @@ -795,22 +786,12 @@ static int __svc_rpcb_register6(const u32 program, const u32 version, netid = RPCBIND_NETID_TCP6; break; default: - return -ENOPROTOOPT; + return -EPROTONOSUPPORT; } - error = rpcb_v4_register(program, version, - (const struct sockaddr *)&sin6, netid); - - /* - * User space didn't support rpcbind version 4, so we won't - * use a PF_INET6 listener. - */ - if (error == -EPROTONOSUPPORT) - error = -EAFNOSUPPORT; - - return error; + return rpcb_v4_register(program, version, + (struct sockaddr *)&sin6, netid); } -#endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */ /* * Register a kernel RPC service via rpcbind version 4. @@ -818,43 +799,69 @@ static int __svc_rpcb_register6(const u32 program, const u32 version, * Returns zero on success; a negative errno value is returned * if any error occurs. */ -static int __svc_register(const char *progname, - const u32 program, const u32 version, - const int family, +static int __svc_register(const u32 program, const u32 version, + const sa_family_t family, const unsigned short protocol, const unsigned short port) { - int error = -EAFNOSUPPORT; + int error; switch (family) { - case PF_INET: - error = __svc_rpcb_register4(program, version, + case AF_INET: + return __svc_rpcb_register4(program, version, protocol, port); - break; -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - case PF_INET6: + case AF_INET6: error = __svc_rpcb_register6(program, version, protocol, port); -#endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */ + if (error < 0) + return error; + + /* + * Work around bug in some versions of Linux rpcbind + * which don't allow registration of both inet and + * inet6 netids. + * + * Error return ignored for now. + */ + __svc_rpcb_register4(program, version, + protocol, port); + return 0; } - if (error < 0) - printk(KERN_WARNING "svc: failed to register %sv%u RPC " - "service (errno %d).\n", progname, version, -error); - return error; + return -EAFNOSUPPORT; +} + +#else /* CONFIG_SUNRPC_REGISTER_V4 */ + +/* + * Register a kernel RPC service via rpcbind version 2. + * + * Returns zero on success; a negative errno value is returned + * if any error occurs. 
+ */ +static int __svc_register(const u32 program, const u32 version, + sa_family_t family, + const unsigned short protocol, + const unsigned short port) +{ + if (family != AF_INET) + return -EAFNOSUPPORT; + + return rpcb_register(program, version, protocol, port); } +#endif /* CONFIG_SUNRPC_REGISTER_V4 */ + /** * svc_register - register an RPC service with the local portmapper * @serv: svc_serv struct for the service to register - * @family: protocol family of service's listener socket * @proto: transport protocol number to advertise * @port: port to advertise * - * Service is registered for any address in the passed-in protocol family + * Service is registered for any address in serv's address family */ -int svc_register(const struct svc_serv *serv, const int family, - const unsigned short proto, const unsigned short port) +int svc_register(const struct svc_serv *serv, const unsigned short proto, + const unsigned short port) { struct svc_program *progp; unsigned int i; @@ -872,15 +879,15 @@ int svc_register(const struct svc_serv *serv, const int family, i, proto == IPPROTO_UDP? "udp" : "tcp", port, - family, + serv->sv_family, progp->pg_vers[i]->vs_hidden? " (but not telling portmap)" : ""); if (progp->pg_vers[i]->vs_hidden) continue; - error = __svc_register(progp->pg_name, progp->pg_prog, - i, family, proto, port); + error = __svc_register(progp->pg_prog, i, + serv->sv_family, proto, port); if (error < 0) break; } @@ -889,31 +896,38 @@ int svc_register(const struct svc_serv *serv, const int family, return error; } -/* - * If user space is running rpcbind, it should take the v4 UNSET - * and clear everything for this [program, version]. If user space - * is running portmap, it will reject the v4 UNSET, but won't have - * any "inet6" entries anyway. So a PMAP_UNSET should be sufficient - * in this case to clear all existing entries for [program, version]. - */ +#ifdef CONFIG_SUNRPC_REGISTER_V4 + static void __svc_unregister(const u32 program, const u32 version, const char *progname) { + struct sockaddr_in6 sin6 = { + .sin6_family = AF_INET6, + .sin6_addr = IN6ADDR_ANY_INIT, + .sin6_port = 0, + }; int error; - error = rpcb_v4_register(program, version, NULL, ""); + error = rpcb_v4_register(program, version, + (struct sockaddr *)&sin6, ""); + dprintk("svc: %s(%sv%u), error %d\n", + __func__, progname, version, error); +} - /* - * User space didn't support rpcbind v4, so retry this - * request with the legacy rpcbind v2 protocol. 
- */ - if (error == -EPROTONOSUPPORT) - error = rpcb_register(program, version, 0, 0); +#else /* CONFIG_SUNRPC_REGISTER_V4 */ +static void __svc_unregister(const u32 program, const u32 version, + const char *progname) +{ + int error; + + error = rpcb_register(program, version, 0, 0); dprintk("svc: %s(%sv%u), error %d\n", __func__, progname, version, error); } +#endif /* CONFIG_SUNRPC_REGISTER_V4 */ + /* * All netids, bind addresses and ports registered for [program, version] * are removed from the local rpcbind database (if the service is not diff --git a/trunk/net/sunrpc/svc_xprt.c b/trunk/net/sunrpc/svc_xprt.c index 2819ee093f36..e588df5d6b34 100644 --- a/trunk/net/sunrpc/svc_xprt.c +++ b/trunk/net/sunrpc/svc_xprt.c @@ -161,9 +161,7 @@ EXPORT_SYMBOL_GPL(svc_xprt_init); static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl, struct svc_serv *serv, - const int family, - const unsigned short port, - int flags) + unsigned short port, int flags) { struct sockaddr_in sin = { .sin_family = AF_INET, @@ -178,12 +176,12 @@ static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl, struct sockaddr *sap; size_t len; - switch (family) { - case PF_INET: + switch (serv->sv_family) { + case AF_INET: sap = (struct sockaddr *)&sin; len = sizeof(sin); break; - case PF_INET6: + case AF_INET6: sap = (struct sockaddr *)&sin6; len = sizeof(sin6); break; @@ -194,8 +192,7 @@ static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl, return xcl->xcl_ops->xpo_create(serv, sap, len, flags); } -int svc_create_xprt(struct svc_serv *serv, const char *xprt_name, - const int family, const unsigned short port, +int svc_create_xprt(struct svc_serv *serv, char *xprt_name, unsigned short port, int flags) { struct svc_xprt_class *xcl; @@ -212,7 +209,7 @@ int svc_create_xprt(struct svc_serv *serv, const char *xprt_name, goto err; spin_unlock(&svc_xprt_class_lock); - newxprt = __svc_xpo_create(xcl, serv, family, port, flags); + newxprt = __svc_xpo_create(xcl, serv, port, flags); if (IS_ERR(newxprt)) { module_put(xcl->xcl_owner); return PTR_ERR(newxprt); @@ -1036,13 +1033,7 @@ static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt) return dr; } -/** - * svc_find_xprt - find an RPC transport instance - * @serv: pointer to svc_serv to search - * @xcl_name: C string containing transport's class name - * @af: Address family of transport's local address - * @port: transport's IP port number - * +/* * Return the transport instance pointer for the endpoint accepting * connections/peer traffic from the specified transport class, * address family and port. @@ -1051,14 +1042,14 @@ static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt) * wild-card, and will result in matching the first transport in the * service's list that has a matching class name. 
*/ -struct svc_xprt *svc_find_xprt(struct svc_serv *serv, const char *xcl_name, - const sa_family_t af, const unsigned short port) +struct svc_xprt *svc_find_xprt(struct svc_serv *serv, char *xcl_name, + int af, int port) { struct svc_xprt *xprt; struct svc_xprt *found = NULL; /* Sanity check the args */ - if (serv == NULL || xcl_name == NULL) + if (!serv || !xcl_name) return found; spin_lock_bh(&serv->sv_lock); @@ -1067,7 +1058,7 @@ struct svc_xprt *svc_find_xprt(struct svc_serv *serv, const char *xcl_name, continue; if (af != AF_UNSPEC && af != xprt->xpt_local.ss_family) continue; - if (port != 0 && port != svc_xprt_local_port(xprt)) + if (port && port != svc_xprt_local_port(xprt)) continue; found = xprt; svc_xprt_get(xprt); diff --git a/trunk/net/sunrpc/svcsock.c b/trunk/net/sunrpc/svcsock.c index 9d504234af4a..5763e6460fea 100644 --- a/trunk/net/sunrpc/svcsock.c +++ b/trunk/net/sunrpc/svcsock.c @@ -1110,6 +1110,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, struct svc_sock *svsk; struct sock *inet; int pmap_register = !(flags & SVC_SOCK_ANONYMOUS); + int val; dprintk("svc: svc_setup_socket %p\n", sock); if (!(svsk = kzalloc(sizeof(*svsk), GFP_KERNEL))) { @@ -1121,7 +1122,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, /* Register socket with portmapper */ if (*errp >= 0 && pmap_register) - *errp = svc_register(serv, inet->sk_family, inet->sk_protocol, + *errp = svc_register(serv, inet->sk_protocol, ntohs(inet_sk(inet)->sport)); if (*errp < 0) { @@ -1142,6 +1143,18 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, else svc_tcp_init(svsk, serv); + /* + * We start one listener per sv_serv. We want AF_INET + * requests to be automatically shunted to our AF_INET6 + * listener using a mapped IPv4 address. Make sure + * no-one starts an equivalent IPv4 listener, which + * would steal our incoming connections. + */ + val = 0; + if (serv->sv_family == AF_INET6) + kernel_setsockopt(sock, SOL_IPV6, IPV6_V6ONLY, + (char *)&val, sizeof(val)); + dprintk("svc: svc_setup_socket created %p (inet %p)\n", svsk, svsk->sk_sk); @@ -1209,8 +1222,6 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv, struct sockaddr_storage addr; struct sockaddr *newsin = (struct sockaddr *)&addr; int newlen; - int family; - int val; RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); dprintk("svc: svc_create_socket(%s, %d, %s)\n", @@ -1222,35 +1233,14 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv, "sockets supported\n"); return ERR_PTR(-EINVAL); } - type = (protocol == IPPROTO_UDP)? SOCK_DGRAM : SOCK_STREAM; - switch (sin->sa_family) { - case AF_INET6: - family = PF_INET6; - break; - case AF_INET: - family = PF_INET; - break; - default: - return ERR_PTR(-EINVAL); - } - error = sock_create_kern(family, type, protocol, &sock); + error = sock_create_kern(sin->sa_family, type, protocol, &sock); if (error < 0) return ERR_PTR(error); svc_reclassify_socket(sock); - /* - * If this is an PF_INET6 listener, we want to avoid - * getting requests from IPv4 remotes. Those should - * be shunted to a PF_INET listener via rpcbind. 
- */ - val = 1; - if (family == PF_INET6) - kernel_setsockopt(sock, SOL_IPV6, IPV6_V6ONLY, - (char *)&val, sizeof(val)); - if (type == SOCK_STREAM) sock->sk->sk_reuse = 1; /* allow address reuse */ error = kernel_bind(sock, sin, len); diff --git a/trunk/net/sunrpc/xprt.c b/trunk/net/sunrpc/xprt.c index a0bfe53f1621..62098d101a1f 100644 --- a/trunk/net/sunrpc/xprt.c +++ b/trunk/net/sunrpc/xprt.c @@ -151,37 +151,6 @@ int xprt_unregister_transport(struct xprt_class *transport) } EXPORT_SYMBOL_GPL(xprt_unregister_transport); -/** - * xprt_load_transport - load a transport implementation - * @transport_name: transport to load - * - * Returns: - * 0: transport successfully loaded - * -ENOENT: transport module not available - */ -int xprt_load_transport(const char *transport_name) -{ - struct xprt_class *t; - char module_name[sizeof t->name + 5]; - int result; - - result = 0; - spin_lock(&xprt_list_lock); - list_for_each_entry(t, &xprt_list, list) { - if (strcmp(t->name, transport_name) == 0) { - spin_unlock(&xprt_list_lock); - goto out; - } - } - spin_unlock(&xprt_list_lock); - strcpy(module_name, "xprt"); - strncat(module_name, transport_name, sizeof t->name); - result = request_module(module_name); -out: - return result; -} -EXPORT_SYMBOL_GPL(xprt_load_transport); - /** * xprt_reserve_xprt - serialize write access to transports * @task: task that is requesting access to the transport @@ -611,7 +580,7 @@ void xprt_disconnect_done(struct rpc_xprt *xprt) dprintk("RPC: disconnected transport %p\n", xprt); spin_lock_bh(&xprt->transport_lock); xprt_clear_connected(xprt); - xprt_wake_pending_tasks(xprt, -EAGAIN); + xprt_wake_pending_tasks(xprt, -ENOTCONN); spin_unlock_bh(&xprt->transport_lock); } EXPORT_SYMBOL_GPL(xprt_disconnect_done); @@ -629,7 +598,7 @@ void xprt_force_disconnect(struct rpc_xprt *xprt) /* Try to schedule an autoclose RPC call */ if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0) queue_work(rpciod_workqueue, &xprt->task_cleanup); - xprt_wake_pending_tasks(xprt, -EAGAIN); + xprt_wake_pending_tasks(xprt, -ENOTCONN); spin_unlock_bh(&xprt->transport_lock); } @@ -656,7 +625,7 @@ void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie) /* Try to schedule an autoclose RPC call */ if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0) queue_work(rpciod_workqueue, &xprt->task_cleanup); - xprt_wake_pending_tasks(xprt, -EAGAIN); + xprt_wake_pending_tasks(xprt, -ENOTCONN); out: spin_unlock_bh(&xprt->transport_lock); } @@ -726,8 +695,9 @@ static void xprt_connect_status(struct rpc_task *task) } switch (task->tk_status) { - case -EAGAIN: - dprintk("RPC: %5u xprt_connect_status: retrying\n", task->tk_pid); + case -ENOTCONN: + dprintk("RPC: %5u xprt_connect_status: connection broken\n", + task->tk_pid); break; case -ETIMEDOUT: dprintk("RPC: %5u xprt_connect_status: connect attempt timed " @@ -848,8 +818,15 @@ int xprt_prepare_transmit(struct rpc_task *task) err = req->rq_received; goto out_unlock; } - if (!xprt->ops->reserve_xprt(task)) + if (!xprt->ops->reserve_xprt(task)) { err = -EAGAIN; + goto out_unlock; + } + + if (!xprt_connected(xprt)) { + err = -ENOTCONN; + goto out_unlock; + } out_unlock: spin_unlock_bh(&xprt->transport_lock); return err; @@ -893,26 +870,32 @@ void xprt_transmit(struct rpc_task *task) req->rq_connect_cookie = xprt->connect_cookie; req->rq_xtime = jiffies; status = xprt->ops->send_request(task); - if (status != 0) { - task->tk_status = status; - return; - } + if (status == 0) { + dprintk("RPC: %5u xmit complete\n", task->tk_pid); + 
spin_lock_bh(&xprt->transport_lock); - dprintk("RPC: %5u xmit complete\n", task->tk_pid); - spin_lock_bh(&xprt->transport_lock); + xprt->ops->set_retrans_timeout(task); - xprt->ops->set_retrans_timeout(task); + xprt->stat.sends++; + xprt->stat.req_u += xprt->stat.sends - xprt->stat.recvs; + xprt->stat.bklog_u += xprt->backlog.qlen; - xprt->stat.sends++; - xprt->stat.req_u += xprt->stat.sends - xprt->stat.recvs; - xprt->stat.bklog_u += xprt->backlog.qlen; + /* Don't race with disconnect */ + if (!xprt_connected(xprt)) + task->tk_status = -ENOTCONN; + else if (!req->rq_received) + rpc_sleep_on(&xprt->pending, task, xprt_timer); + spin_unlock_bh(&xprt->transport_lock); + return; + } - /* Don't race with disconnect */ - if (!xprt_connected(xprt)) - task->tk_status = -ENOTCONN; - else if (!req->rq_received) - rpc_sleep_on(&xprt->pending, task, xprt_timer); - spin_unlock_bh(&xprt->transport_lock); + /* Note: at this point, task->tk_sleeping has not yet been set, + * hence there is no danger of the waking up task being put on + * schedq, and being picked up by a parallel run of rpciod(). + */ + task->tk_status = status; + if (status == -ECONNREFUSED) + rpc_sleep_on(&xprt->sending, task, NULL); } static inline void do_xprt_reserve(struct rpc_task *task) diff --git a/trunk/net/sunrpc/xprtrdma/rpc_rdma.c b/trunk/net/sunrpc/xprtrdma/rpc_rdma.c index e5e28d1946a4..14106d26bb95 100644 --- a/trunk/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/trunk/net/sunrpc/xprtrdma/rpc_rdma.c @@ -310,19 +310,6 @@ rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad) __func__, pad, destp, rqst->rq_slen, curlen); copy_len = rqst->rq_snd_buf.page_len; - - if (rqst->rq_snd_buf.tail[0].iov_len) { - curlen = rqst->rq_snd_buf.tail[0].iov_len; - if (destp + copy_len != rqst->rq_snd_buf.tail[0].iov_base) { - memmove(destp + copy_len, - rqst->rq_snd_buf.tail[0].iov_base, curlen); - r_xprt->rx_stats.pullup_copy_count += curlen; - } - dprintk("RPC: %s: tail destp 0x%p len %d\n", - __func__, destp + copy_len, curlen); - rqst->rq_svec[0].iov_len += curlen; - } - r_xprt->rx_stats.pullup_copy_count += copy_len; npages = PAGE_ALIGN(rqst->rq_snd_buf.page_base+copy_len) >> PAGE_SHIFT; for (i = 0; copy_len && i < npages; i++) { @@ -345,6 +332,17 @@ rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad) destp += curlen; copy_len -= curlen; } + if (rqst->rq_snd_buf.tail[0].iov_len) { + curlen = rqst->rq_snd_buf.tail[0].iov_len; + if (destp != rqst->rq_snd_buf.tail[0].iov_base) { + memcpy(destp, + rqst->rq_snd_buf.tail[0].iov_base, curlen); + r_xprt->rx_stats.pullup_copy_count += curlen; + } + dprintk("RPC: %s: tail destp 0x%p len %d curlen %d\n", + __func__, destp, copy_len, curlen); + rqst->rq_svec[0].iov_len += curlen; + } /* header now contains entire send message */ return pad; } @@ -658,7 +656,7 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) if (curlen > rqst->rq_rcv_buf.tail[0].iov_len) curlen = rqst->rq_rcv_buf.tail[0].iov_len; if (rqst->rq_rcv_buf.tail[0].iov_base != srcp) - memmove(rqst->rq_rcv_buf.tail[0].iov_base, srcp, curlen); + memcpy(rqst->rq_rcv_buf.tail[0].iov_base, srcp, curlen); dprintk("RPC: %s: tail srcp 0x%p len %d curlen %d\n", __func__, srcp, copy_len, curlen); rqst->rq_rcv_buf.tail[0].iov_len = curlen; diff --git a/trunk/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/trunk/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 6c26a675435a..a3334e3b73cc 100644 --- a/trunk/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/trunk/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -191,6 +191,7 @@ static int map_xdr(struct 
svcxprt_rdma *xprt, struct xdr_buf *xdr, struct svc_rdma_req_map *vec) { + int sge_max = (xdr->len+PAGE_SIZE-1) / PAGE_SIZE + 3; int sge_no; u32 sge_bytes; u32 page_bytes; @@ -234,11 +235,7 @@ static int map_xdr(struct svcxprt_rdma *xprt, sge_no++; } - dprintk("svcrdma: map_xdr: sge_no %d page_no %d " - "page_base %u page_len %u head_len %zu tail_len %zu\n", - sge_no, page_no, xdr->page_base, xdr->page_len, - xdr->head[0].iov_len, xdr->tail[0].iov_len); - + BUG_ON(sge_no > sge_max); vec->count = sge_no; return 0; } @@ -582,6 +579,7 @@ static int send_reply(struct svcxprt_rdma *rdma, ctxt->sge[page_no+1].length = 0; } BUG_ON(sge_no > rdma->sc_max_sge); + BUG_ON(sge_no > ctxt->count); memset(&send_wr, 0, sizeof send_wr); ctxt->wr_op = IB_WR_SEND; send_wr.wr_id = (unsigned long)ctxt; diff --git a/trunk/net/sunrpc/xprtsock.c b/trunk/net/sunrpc/xprtsock.c index d40ff50887aa..568330eebbfe 100644 --- a/trunk/net/sunrpc/xprtsock.c +++ b/trunk/net/sunrpc/xprtsock.c @@ -49,9 +49,6 @@ unsigned int xprt_tcp_slot_table_entries = RPC_DEF_SLOT_TABLE; unsigned int xprt_min_resvport = RPC_DEF_MIN_RESVPORT; unsigned int xprt_max_resvport = RPC_DEF_MAX_RESVPORT; -#define XS_TCP_LINGER_TO (15U * HZ) -static unsigned int xs_tcp_fin_timeout __read_mostly = XS_TCP_LINGER_TO; - /* * We can register our own files under /proc/sys/sunrpc by * calling register_sysctl_table() again. The files in that @@ -119,14 +116,6 @@ static ctl_table xs_tunables_table[] = { .extra1 = &xprt_min_resvport_limit, .extra2 = &xprt_max_resvport_limit }, - { - .procname = "tcp_fin_timeout", - .data = &xs_tcp_fin_timeout, - .maxlen = sizeof(xs_tcp_fin_timeout), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - .strategy = sysctl_jiffies - }, { .ctl_name = 0, }, @@ -532,12 +521,11 @@ static void xs_nospace_callback(struct rpc_task *task) * @task: task to put to sleep * */ -static int xs_nospace(struct rpc_task *task) +static void xs_nospace(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); - int ret = 0; dprintk("RPC: %5u xmit incomplete (%u left of %u)\n", task->tk_pid, req->rq_slen - req->rq_bytes_sent, @@ -549,7 +537,6 @@ static int xs_nospace(struct rpc_task *task) /* Don't race with disconnect */ if (xprt_connected(xprt)) { if (test_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags)) { - ret = -EAGAIN; /* * Notify TCP that we're limited by the application * window size @@ -561,11 +548,10 @@ static int xs_nospace(struct rpc_task *task) } } else { clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); - ret = -ENOTCONN; + task->tk_status = -ENOTCONN; } spin_unlock_bh(&xprt->transport_lock); - return ret; } /** @@ -608,8 +594,6 @@ static int xs_udp_send_request(struct rpc_task *task) /* Still some bytes left; set up for a retry later. */ status = -EAGAIN; } - if (!transport->sock) - goto out; switch (status) { case -ENOTSOCK: @@ -617,19 +601,21 @@ static int xs_udp_send_request(struct rpc_task *task) /* Should we call xs_close() here? */ break; case -EAGAIN: - status = xs_nospace(task); + xs_nospace(task); break; - default: - dprintk("RPC: sendmsg returned unrecognized error %d\n", - -status); case -ENETUNREACH: case -EPIPE: case -ECONNREFUSED: /* When the server has died, an ICMP port unreachable message * prompts ECONNREFUSED. 
*/ clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); + break; + default: + clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); + dprintk("RPC: sendmsg returned unrecognized error %d\n", + -status); } -out: + return status; } @@ -711,8 +697,6 @@ static int xs_tcp_send_request(struct rpc_task *task) status = -EAGAIN; break; } - if (!transport->sock) - goto out; switch (status) { case -ENOTSOCK: @@ -720,19 +704,23 @@ static int xs_tcp_send_request(struct rpc_task *task) /* Should we call xs_close() here? */ break; case -EAGAIN: - status = xs_nospace(task); + xs_nospace(task); break; - default: - dprintk("RPC: sendmsg returned unrecognized error %d\n", - -status); case -ECONNRESET: - case -EPIPE: xs_tcp_shutdown(xprt); case -ECONNREFUSED: case -ENOTCONN: + case -EPIPE: + status = -ENOTCONN; + clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); + break; + default: + dprintk("RPC: sendmsg returned unrecognized error %d\n", + -status); clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); + xs_tcp_shutdown(xprt); } -out: + return status; } @@ -779,13 +767,23 @@ static void xs_restore_old_callbacks(struct sock_xprt *transport, struct sock *s sk->sk_error_report = transport->old_error_report; } -static void xs_reset_transport(struct sock_xprt *transport) +/** + * xs_close - close a socket + * @xprt: transport + * + * This is used when all requests are complete; ie, no DRC state remains + * on the server we want to save. + */ +static void xs_close(struct rpc_xprt *xprt) { + struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); struct socket *sock = transport->sock; struct sock *sk = transport->inet; - if (sk == NULL) - return; + if (!sk) + goto clear_close_wait; + + dprintk("RPC: xs_close xprt %p\n", xprt); write_lock_bh(&sk->sk_callback_lock); transport->inet = NULL; @@ -799,25 +797,8 @@ static void xs_reset_transport(struct sock_xprt *transport) sk->sk_no_check = 0; sock_release(sock); -} - -/** - * xs_close - close a socket - * @xprt: transport - * - * This is used when all requests are complete; ie, no DRC state remains - * on the server we want to save. 
- */ -static void xs_close(struct rpc_xprt *xprt) -{ - struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); - - dprintk("RPC: xs_close xprt %p\n", xprt); - - xs_reset_transport(transport); - +clear_close_wait: smp_mb__before_clear_bit(); - clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); clear_bit(XPRT_CLOSE_WAIT, &xprt->state); clear_bit(XPRT_CLOSING, &xprt->state); smp_mb__after_clear_bit(); @@ -1145,47 +1126,6 @@ static void xs_tcp_data_ready(struct sock *sk, int bytes) read_unlock(&sk->sk_callback_lock); } -/* - * Do the equivalent of linger/linger2 handling for dealing with - * broken servers that don't close the socket in a timely - * fashion - */ -static void xs_tcp_schedule_linger_timeout(struct rpc_xprt *xprt, - unsigned long timeout) -{ - struct sock_xprt *transport; - - if (xprt_test_and_set_connecting(xprt)) - return; - set_bit(XPRT_CONNECTION_ABORT, &xprt->state); - transport = container_of(xprt, struct sock_xprt, xprt); - queue_delayed_work(rpciod_workqueue, &transport->connect_worker, - timeout); -} - -static void xs_tcp_cancel_linger_timeout(struct rpc_xprt *xprt) -{ - struct sock_xprt *transport; - - transport = container_of(xprt, struct sock_xprt, xprt); - - if (!test_bit(XPRT_CONNECTION_ABORT, &xprt->state) || - !cancel_delayed_work(&transport->connect_worker)) - return; - clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); - xprt_clear_connecting(xprt); -} - -static void xs_sock_mark_closed(struct rpc_xprt *xprt) -{ - smp_mb__before_clear_bit(); - clear_bit(XPRT_CLOSE_WAIT, &xprt->state); - clear_bit(XPRT_CLOSING, &xprt->state); - smp_mb__after_clear_bit(); - /* Mark transport as closed and wake up all pending tasks */ - xprt_disconnect_done(xprt); -} - /** * xs_tcp_state_change - callback to handle TCP socket state changes * @sk: socket whose state has changed @@ -1218,7 +1158,7 @@ static void xs_tcp_state_change(struct sock *sk) transport->tcp_flags = TCP_RCV_COPY_FRAGHDR | TCP_RCV_COPY_XID; - xprt_wake_pending_tasks(xprt, -EAGAIN); + xprt_wake_pending_tasks(xprt, 0); } spin_unlock_bh(&xprt->transport_lock); break; @@ -1231,10 +1171,10 @@ static void xs_tcp_state_change(struct sock *sk) clear_bit(XPRT_CONNECTED, &xprt->state); clear_bit(XPRT_CLOSE_WAIT, &xprt->state); smp_mb__after_clear_bit(); - xs_tcp_schedule_linger_timeout(xprt, xs_tcp_fin_timeout); break; case TCP_CLOSE_WAIT: /* The server initiated a shutdown of the socket */ + set_bit(XPRT_CLOSING, &xprt->state); xprt_force_disconnect(xprt); case TCP_SYN_SENT: xprt->connect_cookie++; @@ -1247,35 +1187,40 @@ static void xs_tcp_state_change(struct sock *sk) xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO; break; case TCP_LAST_ACK: - set_bit(XPRT_CLOSING, &xprt->state); - xs_tcp_schedule_linger_timeout(xprt, xs_tcp_fin_timeout); smp_mb__before_clear_bit(); clear_bit(XPRT_CONNECTED, &xprt->state); smp_mb__after_clear_bit(); break; case TCP_CLOSE: - xs_tcp_cancel_linger_timeout(xprt); - xs_sock_mark_closed(xprt); + smp_mb__before_clear_bit(); + clear_bit(XPRT_CLOSE_WAIT, &xprt->state); + clear_bit(XPRT_CLOSING, &xprt->state); + smp_mb__after_clear_bit(); + /* Mark transport as closed and wake up all pending tasks */ + xprt_disconnect_done(xprt); } out: read_unlock(&sk->sk_callback_lock); } /** - * xs_error_report - callback mainly for catching socket errors + * xs_tcp_error_report - callback mainly for catching RST events * @sk: socket */ -static void xs_error_report(struct sock *sk) +static void xs_tcp_error_report(struct sock *sk) { struct rpc_xprt *xprt; read_lock(&sk->sk_callback_lock); + if 
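With the linger machinery removed, the TCP_CLOSE arm above performs the full mark-closed sequence inline: clear XPRT_CLOSE_WAIT and XPRT_CLOSING between memory barriers, then let xprt_disconnect_done() wake every pending task. The barrier pair matters because woken tasks test those bits without holding the transport lock. A loose userspace model of that shape using C11 atomics and explicit fences — these are stand-ins for clear_bit() and smp_mb__before/after_clear_bit(), not the kernel bitops themselves:

```c
#include <assert.h>
#include <stdatomic.h>

#define XPRT_CLOSE_WAIT 0UL
#define XPRT_CLOSING    1UL

static atomic_ulong xprt_state;

/* Userspace model of the TCP_CLOSE arm: clear both lifecycle bits with
 * full fences on either side, so any task that observes the wakeup
 * also observes both bits clear. Illustrative only. */
static void mark_closed(void)
{
	atomic_thread_fence(memory_order_seq_cst);	/* smp_mb__before_clear_bit() */
	atomic_fetch_and(&xprt_state, ~(1UL << XPRT_CLOSE_WAIT));
	atomic_fetch_and(&xprt_state, ~(1UL << XPRT_CLOSING));
	atomic_thread_fence(memory_order_seq_cst);	/* smp_mb__after_clear_bit() */
	/* ...xprt_disconnect_done() would wake all pending tasks here. */
}

int main(void)
{
	atomic_fetch_or(&xprt_state,
			(1UL << XPRT_CLOSE_WAIT) | (1UL << XPRT_CLOSING));
	mark_closed();
	assert(atomic_load(&xprt_state) == 0);
	return 0;
}
```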
(sk->sk_err != ECONNRESET || sk->sk_state != TCP_ESTABLISHED) + goto out; if (!(xprt = xprt_from_sock(sk))) goto out; dprintk("RPC: %s client %p...\n" "RPC: error %d\n", __func__, xprt, sk->sk_err); - xprt_wake_pending_tasks(xprt, -EAGAIN); + + xprt_force_disconnect(xprt); out: read_unlock(&sk->sk_callback_lock); } @@ -1549,7 +1494,6 @@ static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) sk->sk_user_data = xprt; sk->sk_data_ready = xs_udp_data_ready; sk->sk_write_space = xs_udp_write_space; - sk->sk_error_report = xs_error_report; sk->sk_no_check = UDP_CSUM_NORCV; sk->sk_allocation = GFP_ATOMIC; @@ -1582,10 +1526,9 @@ static void xs_udp_connect_worker4(struct work_struct *work) goto out; /* Start by resetting any existing state */ - xs_reset_transport(transport); + xs_close(xprt); - err = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); - if (err < 0) { + if ((err = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock)) < 0) { dprintk("RPC: can't create UDP transport socket (%d).\n", -err); goto out; } @@ -1602,8 +1545,8 @@ static void xs_udp_connect_worker4(struct work_struct *work) xs_udp_finish_connecting(xprt, sock); status = 0; out: - xprt_clear_connecting(xprt); xprt_wake_pending_tasks(xprt, status); + xprt_clear_connecting(xprt); } /** @@ -1624,10 +1567,9 @@ static void xs_udp_connect_worker6(struct work_struct *work) goto out; /* Start by resetting any existing state */ - xs_reset_transport(transport); + xs_close(xprt); - err = sock_create_kern(PF_INET6, SOCK_DGRAM, IPPROTO_UDP, &sock); - if (err < 0) { + if ((err = sock_create_kern(PF_INET6, SOCK_DGRAM, IPPROTO_UDP, &sock)) < 0) { dprintk("RPC: can't create UDP transport socket (%d).\n", -err); goto out; } @@ -1644,17 +1586,18 @@ static void xs_udp_connect_worker6(struct work_struct *work) xs_udp_finish_connecting(xprt, sock); status = 0; out: - xprt_clear_connecting(xprt); xprt_wake_pending_tasks(xprt, status); + xprt_clear_connecting(xprt); } /* * We need to preserve the port number so the reply cache on the server can * find our cached RPC replies when we get around to reconnecting. 
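The renamed error callback is now deliberately narrow: it ignores every socket error except an ECONNRESET raised while the socket was still ESTABLISHED — that is, a genuine RST from the peer — and only then forces a transport disconnect, leaving all other transitions to the state-change callback. A userspace sketch of that predicate (the function name is mine; TCP_ESTABLISHED comes from <netinet/tcp.h>):

```c
#include <assert.h>
#include <errno.h>
#include <netinet/tcp.h>
#include <stdbool.h>

/* Only a connection reset reported against a socket that was still in
 * ESTABLISHED is treated as a peer RST worth tearing the transport
 * down for; everything else takes the ordinary close path. */
static bool is_peer_reset(int sk_err, int sk_state)
{
	return sk_err == ECONNRESET && sk_state == TCP_ESTABLISHED;
}

int main(void)
{
	assert(is_peer_reset(ECONNRESET, TCP_ESTABLISHED));
	assert(!is_peer_reset(ETIMEDOUT, TCP_ESTABLISHED));
	assert(!is_peer_reset(ECONNRESET, TCP_CLOSE_WAIT));
	return 0;
}
```

Filtering here avoids tearing the transport down for transient errors that the close/reconnect machinery already handles on its own.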
*/ -static void xs_abort_connection(struct rpc_xprt *xprt, struct sock_xprt *transport) +static void xs_tcp_reuse_connection(struct rpc_xprt *xprt) { int result; + struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); struct sockaddr any; dprintk("RPC: disconnecting xprt %p to reuse port\n", xprt); @@ -1666,24 +1609,11 @@ static void xs_abort_connection(struct rpc_xprt *xprt, struct sock_xprt *transpo memset(&any, 0, sizeof(any)); any.sa_family = AF_UNSPEC; result = kernel_connect(transport->sock, &any, sizeof(any), 0); - if (!result) - xs_sock_mark_closed(xprt); - else + if (result) dprintk("RPC: AF_UNSPEC connect return code %d\n", result); } -static void xs_tcp_reuse_connection(struct rpc_xprt *xprt, struct sock_xprt *transport) -{ - unsigned int state = transport->inet->sk_state; - - if (state == TCP_CLOSE && transport->sock->state == SS_UNCONNECTED) - return; - if ((1 << state) & (TCPF_ESTABLISHED|TCPF_SYN_SENT)) - return; - xs_abort_connection(xprt, transport); -} - static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); @@ -1699,7 +1629,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) sk->sk_data_ready = xs_tcp_data_ready; sk->sk_state_change = xs_tcp_state_change; sk->sk_write_space = xs_tcp_write_space; - sk->sk_error_report = xs_error_report; + sk->sk_error_report = xs_tcp_error_report; sk->sk_allocation = GFP_ATOMIC; /* socket options */ @@ -1727,42 +1657,37 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) } /** - * xs_tcp_setup_socket - create a TCP socket and connect to a remote endpoint - * @xprt: RPC transport to connect - * @transport: socket transport to connect - * @create_sock: function to create a socket of the correct type + * xs_tcp_connect_worker4 - connect a TCP socket to a remote endpoint + * @work: RPC transport to connect * * Invoked by a work queue tasklet. 
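xs_tcp_reuse_connection leans on a classic sockets trick: connect()ing a socket to an address whose family is AF_UNSPEC dissolves the current association while the local port binding survives, so the next connect() leaves from the same source port and the server's duplicate reply cache can still match our retransmitted requests. The same call works from userspace; a small runnable sketch (no server needed, since disconnecting a not-yet-connected TCP socket also succeeds on Linux):

```c
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

/* Dissolve a socket's association but keep its local port binding --
 * the trick kernel_connect(&any, ...) performs in the hunk above. */
static int disconnect_keep_port(int fd)
{
	struct sockaddr any;

	memset(&any, 0, sizeof(any));
	any.sa_family = AF_UNSPEC;
	if (connect(fd, &any, sizeof(any)) < 0) {
		perror("connect(AF_UNSPEC)");
		return -1;
	}
	return 0;
}

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0) { perror("socket"); return 1; }
	/* In the real path the socket would be connected to the server here. */
	if (disconnect_keep_port(fd) == 0)
		puts("association dissolved; local port retained");
	close(fd);
	return 0;
}
```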
*/ -static void xs_tcp_setup_socket(struct rpc_xprt *xprt, - struct sock_xprt *transport, - struct socket *(*create_sock)(struct rpc_xprt *, - struct sock_xprt *)) +static void xs_tcp_connect_worker4(struct work_struct *work) { + struct sock_xprt *transport = + container_of(work, struct sock_xprt, connect_worker.work); + struct rpc_xprt *xprt = &transport->xprt; struct socket *sock = transport->sock; - int status = -EIO; + int err, status = -EIO; if (xprt->shutdown) goto out; if (!sock) { - clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); - sock = create_sock(xprt, transport); - if (IS_ERR(sock)) { - status = PTR_ERR(sock); + /* start from scratch */ + if ((err = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock)) < 0) { + dprintk("RPC: can't create TCP transport socket (%d).\n", -err); goto out; } - } else { - int abort_and_exit; + xs_reclassify_socket4(sock); - abort_and_exit = test_and_clear_bit(XPRT_CONNECTION_ABORT, - &xprt->state); + if (xs_bind4(transport, sock) < 0) { + sock_release(sock); + goto out; + } + } else /* "close" the socket, preserving the local port */ - xs_tcp_reuse_connection(xprt, transport); - - if (abort_and_exit) - goto out_eagain; - } + xs_tcp_reuse_connection(xprt); dprintk("RPC: worker connecting xprt %p to address: %s\n", xprt, xprt->address_strings[RPC_DISPLAY_ALL]); @@ -1771,104 +1696,83 @@ static void xs_tcp_setup_socket(struct rpc_xprt *xprt, dprintk("RPC: %p connect status %d connected %d sock state %d\n", xprt, -status, xprt_connected(xprt), sock->sk->sk_state); - switch (status) { - case -ECONNREFUSED: - case -ECONNRESET: - case -ENETUNREACH: - /* retry with existing socket, after a delay */ - case 0: - case -EINPROGRESS: - case -EALREADY: - xprt_clear_connecting(xprt); - return; + if (status < 0) { + switch (status) { + case -EINPROGRESS: + case -EALREADY: + goto out_clear; + case -ECONNREFUSED: + case -ECONNRESET: + /* retry with existing socket, after a delay */ + break; + default: + /* get rid of existing socket, and retry */ + xs_tcp_shutdown(xprt); + } } - /* get rid of existing socket, and retry */ - xs_tcp_shutdown(xprt); - printk("%s: connect returned unhandled error %d\n", - __func__, status); -out_eagain: - status = -EAGAIN; out: - xprt_clear_connecting(xprt); xprt_wake_pending_tasks(xprt, status); -} - -static struct socket *xs_create_tcp_sock4(struct rpc_xprt *xprt, - struct sock_xprt *transport) -{ - struct socket *sock; - int err; - - /* start from scratch */ - err = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); - if (err < 0) { - dprintk("RPC: can't create TCP transport socket (%d).\n", - -err); - goto out_err; - } - xs_reclassify_socket4(sock); - - if (xs_bind4(transport, sock) < 0) { - sock_release(sock); - goto out_err; - } - return sock; -out_err: - return ERR_PTR(-EIO); +out_clear: + xprt_clear_connecting(xprt); } /** - * xs_tcp_connect_worker4 - connect a TCP socket to a remote endpoint + * xs_tcp_connect_worker6 - connect a TCP socket to a remote endpoint * @work: RPC transport to connect * * Invoked by a work queue tasklet. 
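The rewritten worker makes the connect-status triage explicit: -EINPROGRESS and -EALREADY just mean the nonblocking connect is still in flight (clear the connecting flag and wait), -ECONNREFUSED and -ECONNRESET keep the socket around for a delayed retry, and anything else discards the socket via xs_tcp_shutdown. The same classification applies to an ordinary nonblocking connect() in userspace; in this hypothetical demo the 192.0.2.1 address and the NFS port number are placeholders:

```c
#include <arpa/inet.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct sockaddr_in addr;
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0) { perror("socket"); return 1; }
	fcntl(fd, F_SETFL, O_NONBLOCK);		/* connect() must not block */

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_port = htons(2049);		/* hypothetical NFS port */
	inet_pton(AF_INET, "192.0.2.1", &addr.sin_addr);

	if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
		switch (errno) {
		case EINPROGRESS:
		case EALREADY:
			puts("connect in flight; wait for writability");
			break;
		case ECONNREFUSED:
		case ECONNRESET:
			puts("peer rejected; retry later on the same socket");
			break;
		default:
			perror("connect");	/* unknown: drop the socket */
		}
	}
	close(fd);
	return 0;
}
```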
*/ -static void xs_tcp_connect_worker4(struct work_struct *work) +static void xs_tcp_connect_worker6(struct work_struct *work) { struct sock_xprt *transport = container_of(work, struct sock_xprt, connect_worker.work); struct rpc_xprt *xprt = &transport->xprt; + struct socket *sock = transport->sock; + int err, status = -EIO; - xs_tcp_setup_socket(xprt, transport, xs_create_tcp_sock4); -} + if (xprt->shutdown) + goto out; -static struct socket *xs_create_tcp_sock6(struct rpc_xprt *xprt, - struct sock_xprt *transport) -{ - struct socket *sock; - int err; - - /* start from scratch */ - err = sock_create_kern(PF_INET6, SOCK_STREAM, IPPROTO_TCP, &sock); - if (err < 0) { - dprintk("RPC: can't create TCP transport socket (%d).\n", - -err); - goto out_err; - } - xs_reclassify_socket6(sock); + if (!sock) { + /* start from scratch */ + if ((err = sock_create_kern(PF_INET6, SOCK_STREAM, IPPROTO_TCP, &sock)) < 0) { + dprintk("RPC: can't create TCP transport socket (%d).\n", -err); + goto out; + } + xs_reclassify_socket6(sock); - if (xs_bind6(transport, sock) < 0) { - sock_release(sock); - goto out_err; - } - return sock; -out_err: - return ERR_PTR(-EIO); -} + if (xs_bind6(transport, sock) < 0) { + sock_release(sock); + goto out; + } + } else + /* "close" the socket, preserving the local port */ + xs_tcp_reuse_connection(xprt); -/** - * xs_tcp_connect_worker6 - connect a TCP socket to a remote endpoint - * @work: RPC transport to connect - * - * Invoked by a work queue tasklet. - */ -static void xs_tcp_connect_worker6(struct work_struct *work) -{ - struct sock_xprt *transport = - container_of(work, struct sock_xprt, connect_worker.work); - struct rpc_xprt *xprt = &transport->xprt; + dprintk("RPC: worker connecting xprt %p to address: %s\n", + xprt, xprt->address_strings[RPC_DISPLAY_ALL]); - xs_tcp_setup_socket(xprt, transport, xs_create_tcp_sock6); + status = xs_tcp_finish_connecting(xprt, sock); + dprintk("RPC: %p connect status %d connected %d sock state %d\n", + xprt, -status, xprt_connected(xprt), sock->sk->sk_state); + if (status < 0) { + switch (status) { + case -EINPROGRESS: + case -EALREADY: + goto out_clear; + case -ECONNREFUSED: + case -ECONNRESET: + /* retry with existing socket, after a delay */ + break; + default: + /* get rid of existing socket, and retry */ + xs_tcp_shutdown(xprt); + } + } +out: + xprt_wake_pending_tasks(xprt, status); +out_clear: + xprt_clear_connecting(xprt); } /** @@ -1913,6 +1817,9 @@ static void xs_tcp_connect(struct rpc_task *task) { struct rpc_xprt *xprt = task->tk_xprt; + /* Initiate graceful shutdown of the socket if not already done */ + if (test_bit(XPRT_CONNECTED, &xprt->state)) + xs_tcp_shutdown(xprt); /* Exit if we need to wait for socket shutdown to complete */ if (test_bit(XPRT_CLOSING, &xprt->state)) return;
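The new check at the top of xs_tcp_connect changes reconnect ordering: if the transport still looks connected, it first drives a graceful shutdown — a send-side close that emits a FIN while keeping the receive side open — then returns while XPRT_CLOSING is set and lets the TCP_CLOSE state change re-arm the connect. A rough userspace shape of that entry path; the flags and function name are illustrative, not the kernel API:

```c
#include <stdbool.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

/* If the old connection is still up, start a graceful FIN handshake
 * (the userspace analogue of what xs_tcp_shutdown drives) and bail
 * out while the close is in progress rather than racing it. */
static void tcp_reconnect(int fd, bool connected, bool closing)
{
	if (connected && shutdown(fd, SHUT_WR) < 0)
		perror("shutdown");
	if (closing)
		return;		/* wait for the close; state change re-drives us */
	/* ...otherwise queue the connect worker (delayed on reconnect). */
	puts("would queue connect worker");
}

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	tcp_reconnect(fd, false, false);	/* nothing up yet: connect directly */
	close(fd);
	return 0;
}
```

Closing gracefully instead of aborting gives the server a chance to flush pending replies and keeps the port-preserving reuse path above usable on the next connect.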