Merge branch 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel…

…/git/tytso/ext4 * 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (33 commits) ext4: Regularize mount options ext4: fix locking typo in mballoc which could cause soft lockup hangs ext4: fix typo which causes a memory leak on error path jbd2: Update locking coments ext4: Rename pa_linear to pa_type ext4: add checks of block references for non-extent inodes ext4: Check for an valid i_mode when reading the inode from disk ext4: Use WRITE_SYNC for commits which are caused by fsync() ext4: Add auto_da_alloc mount option ext4: Use struct flex_groups to calculate get_orlov_stats() ext4: Use atomic_t's in struct flex_groups ext4: remove /proc tuning knobs ext4: Add sysfs support ext4: Track lifetime disk writes ext4: Fix discard of inode prealloc space with delayed allocation. ext4: Automatically allocate delay allocated blocks on rename ext4: Automatically allocate delay allocated blocks on close ext4: add EXT4_IOC_ALLOC_DA_BLKS ioctl ext4: Simplify delalloc code by removing mpage_da_writepages() ext4: Save stack space by removing fake buffer heads ...
git-mirror · Apr 1, 2009 · 395d734 · 395d734
2 parents c226fd6 + 06705bf
commit 395d734
Show file tree

Hide file tree

Showing 23 changed files with 1,224 additions and 600 deletions.
diff --git a/Documentation/ABI/testing/sysfs-fs-ext4 b/Documentation/ABI/testing/sysfs-fs-ext4
@@ -0,0 +1,81 @@
+What:		/sys/fs/ext4/<disk>/mb_stats
+Date:		March 2008
+Contact:	"Theodore Ts'o" <tytso@mit.edu>
+Description:
+		 Controls whether the multiblock allocator should
+		 collect statistics, which are shown during the unmount.
+		 1 means to collect statistics, 0 means not to collect
+		 statistics
+
+What:		/sys/fs/ext4/<disk>/mb_group_prealloc
+Date:		March 2008
+Contact:	"Theodore Ts'o" <tytso@mit.edu>
+Description:
+		The multiblock allocator will round up allocation
+		requests to a multiple of this tuning parameter if the
+		stripe size is not set in the ext4 superblock
+
+What:		/sys/fs/ext4/<disk>/mb_max_to_scan
+Date:		March 2008
+Contact:	"Theodore Ts'o" <tytso@mit.edu>
+Description:
+		The maximum number of extents the multiblock allocator
+		will search to find the best extent
+
+What:		/sys/fs/ext4/<disk>/mb_min_to_scan
+Date:		March 2008
+Contact:	"Theodore Ts'o" <tytso@mit.edu>
+Description:
+		The minimum number of extents the multiblock allocator
+		will search to find the best extent
+
+What:		/sys/fs/ext4/<disk>/mb_order2_req
+Date:		March 2008
+Contact:	"Theodore Ts'o" <tytso@mit.edu>
+Description:
+		Tuning parameter which controls the minimum size for 
+		requests (as a power of 2) where the buddy cache is
+		used
+
+What:		/sys/fs/ext4/<disk>/mb_stream_req
+Date:		March 2008
+Contact:	"Theodore Ts'o" <tytso@mit.edu>
+Description:
+		Files which have fewer blocks than this tunable
+		parameter will have their blocks allocated out of a
+		block group specific preallocation pool, so that small
+		files are packed closely together.  Each large file
+		 will have its blocks allocated out of its own unique
+		 preallocation pool.
+
+What:		/sys/fs/ext4/<disk>/inode_readahead
+Date:		March 2008
+Contact:	"Theodore Ts'o" <tytso@mit.edu>
+Description:
+		Tuning parameter which controls the maximum number of
+		inode table blocks that ext4's inode table readahead
+		algorithm will pre-read into the buffer cache
+
+What:		/sys/fs/ext4/<disk>/delayed_allocation_blocks
+Date:		March 2008
+Contact:	"Theodore Ts'o" <tytso@mit.edu>
+Description:
+		This file is read-only and shows the number of blocks
+		that are dirty in the page cache, but which do not
+		have their location in the filesystem allocated yet.
+
+What:		/sys/fs/ext4/<disk>/lifetime_write_kbytes
+Date:		March 2008
+Contact:	"Theodore Ts'o" <tytso@mit.edu>
+Description:
+		This file is read-only and shows the number of kilobytes
+		of data that have been written to this filesystem since it was
+		created.
+
+What:		/sys/fs/ext4/<disk>/session_write_kbytes
+Date:		March 2008
+Contact:	"Theodore Ts'o" <tytso@mit.edu>
+Description:
+		This file is read-only and shows the number of
+		kilobytes of data that have been written to this
+		filesystem since it was mounted.
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
@@ -85,7 +85,7 @@ Note: More extensive information for getting started with ext4 can be
 * extent format more robust in face of on-disk corruption due to magics,
 * internal redundancy in tree
 * improved file allocation (multi-block alloc)
-* fix 32000 subdirectory limit
+* lift 32000 subdirectory limit imposed by i_links_count[1]
 * nsec timestamps for mtime, atime, ctime, create time
 * inode version field on disk (NFSv4, Lustre)
 * reduced e2fsck time via uninit_bg feature
@@ -100,6 +100,9 @@ Note: More extensive information for getting started with ext4 can be
 * efficent new ordered mode in JBD2 and ext4(avoid using buffer head to force
   the ordering)
 
+[1] Filesystems with a block size of 1k may see a limit imposed by the
+directory hash tree having a maximum depth of two.
+
 2.2 Candidate features for future inclusion
 
 * Online defrag (patches available but not well tested)
@@ -180,15 +183,18 @@ commit=nrsec	(*)	Ext4 can be told to sync all its data and metadata
 			performance.
 
 barrier=<0|1(*)>	This enables/disables the use of write barriers in
-			the jbd code.  barrier=0 disables, barrier=1 enables.
-			This also requires an IO stack which can support
+barrier(*)		the jbd code.  barrier=0 disables, barrier=1 enables.
+nobarrier		This also requires an IO stack which can support
 			barriers, and if jbd gets an error on a barrier
 			write, it will disable again with a warning.
 			Write barriers enforce proper on-disk ordering
 			of journal commits, making volatile disk write caches
 			safe to use, at some performance penalty.  If
 			your disks are battery-backed in one way or another,
 			disabling barriers may safely improve performance.
+			The mount options "barrier" and "nobarrier" can
+			also be used to enable or disable barriers, for
+			consistency with other ext4 mount options.
 
 inode_readahead=n	This tuning parameter controls the maximum
 			number of inode table blocks that ext4's inode
@@ -310,6 +316,24 @@ journal_ioprio=prio	The I/O priority (from 0 to 7, where 0 is the
 			a slightly higher priority than the default I/O
 			priority.
 
+auto_da_alloc(*)	Many broken applications don't use fsync() when 
+noauto_da_alloc		replacing existing files via patterns such as
+			fd = open("foo.new")/write(fd,..)/close(fd)/
+			rename("foo.new", "foo"), or worse yet,
+			fd = open("foo", O_TRUNC)/write(fd,..)/close(fd).
+			If auto_da_alloc is enabled, ext4 will detect
+			the replace-via-rename and replace-via-truncate
+			patterns and force that any delayed allocation
+			blocks are allocated such that at the next
+			journal commit, in the default data=ordered
+			mode, the data blocks of the new file are forced
+			to disk before the rename() operation is
+			commited.  This provides roughly the same level
+			of guarantees as ext3, and avoids the
+			"zero-length" problem that can happen when a
+			system crashes before the delayed allocation
+			blocks are forced to disk.
+
 Data Mode
 =========
 There are 3 different data modes:

diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
@@ -940,27 +940,6 @@ Table 1-10: Files in /proc/fs/ext4/<devname>
  File            Content                                        
  mb_groups       details of multiblock allocator buddy cache of free blocks
  mb_history      multiblock allocation history
- stats           controls whether the multiblock allocator should start
-                 collecting statistics, which are shown during the unmount
- group_prealloc  the multiblock allocator will round up allocation
-                 requests to a multiple of this tuning parameter if the
-                 stripe size is not set in the ext4 superblock
- max_to_scan     The maximum number of extents the multiblock allocator
-                 will search to find the best extent
- min_to_scan     The minimum number of extents the multiblock allocator
-                 will search to find the best extent
- order2_req      Tuning parameter which controls the minimum size for 
-                 requests (as a power of 2) where the buddy cache is
-                 used
- stream_req      Files which have fewer blocks than this tunable
-                 parameter will have their blocks allocated out of a
-                 block group specific preallocation pool, so that small
-                 files are packed closely together.  Each large file
-                 will have its blocks allocated out of its own unique
-                 preallocation pool.
-inode_readahead  Tuning parameter which controls the maximum number of
-                 inode table blocks that ext4's inode table readahead
-                 algorithm will pre-read into the buffer cache
 ..............................................................................
 
 

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
@@ -55,18 +55,15 @@ static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block,
 }
 
 static int ext4_group_used_meta_blocks(struct super_block *sb,
-				ext4_group_t block_group)
+				       ext4_group_t block_group,
+				       struct ext4_group_desc *gdp)
 {
 	ext4_fsblk_t tmp;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	/* block bitmap, inode bitmap, and inode table blocks */
 	int used_blocks = sbi->s_itb_per_group + 2;
 
 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
-		struct ext4_group_desc *gdp;
-		struct buffer_head *bh;
-
-		gdp = ext4_get_group_desc(sb, block_group, &bh);
 		if (!ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp),
 					block_group))
 			used_blocks--;
@@ -177,7 +174,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
 		 */
 		mark_bitmap_end(group_blocks, sb->s_blocksize * 8, bh->b_data);
 	}
-	return free_blocks - ext4_group_used_meta_blocks(sb, block_group);
+	return free_blocks - ext4_group_used_meta_blocks(sb, block_group, gdp);
 }
 
 
@@ -473,9 +470,8 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
 
 	if (sbi->s_log_groups_per_flex) {
 		ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
-		spin_lock(sb_bgl_lock(sbi, flex_group));
-		sbi->s_flex_groups[flex_group].free_blocks += blocks_freed;
-		spin_unlock(sb_bgl_lock(sbi, flex_group));
+		atomic_add(blocks_freed,
+			   &sbi->s_flex_groups[flex_group].free_blocks);
 	}
 	/*
 	 * request to reload the buddy with the

diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
@@ -67,7 +67,8 @@ int ext4_check_dir_entry(const char *function, struct inode *dir,
 			 unsigned int offset)
 {
 	const char *error_msg = NULL;
-	const int rlen = ext4_rec_len_from_disk(de->rec_len);
+	const int rlen = ext4_rec_len_from_disk(de->rec_len,
+						dir->i_sb->s_blocksize);
 
 	if (rlen < EXT4_DIR_REC_LEN(1))
 		error_msg = "rec_len is smaller than minimal";
@@ -178,10 +179,11 @@ static int ext4_readdir(struct file *filp,
 				 * least that it is non-zero.  A
 				 * failure will be detected in the
 				 * dirent test below. */
-				if (ext4_rec_len_from_disk(de->rec_len)
-						< EXT4_DIR_REC_LEN(1))
+				if (ext4_rec_len_from_disk(de->rec_len,
+					sb->s_blocksize) < EXT4_DIR_REC_LEN(1))
 					break;
-				i += ext4_rec_len_from_disk(de->rec_len);
+				i += ext4_rec_len_from_disk(de->rec_len,
+							    sb->s_blocksize);
 			}
 			offset = i;
 			filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
@@ -203,7 +205,8 @@ static int ext4_readdir(struct file *filp,
 				ret = stored;
 				goto out;
 			}
-			offset += ext4_rec_len_from_disk(de->rec_len);
+			offset += ext4_rec_len_from_disk(de->rec_len,
+					sb->s_blocksize);
 			if (le32_to_cpu(de->inode)) {
 				/* We might block in the next section
 				 * if the data destination is
@@ -225,7 +228,8 @@ static int ext4_readdir(struct file *filp,
 					goto revalidate;
 				stored++;
 			}
-			filp->f_pos += ext4_rec_len_from_disk(de->rec_len);
+			filp->f_pos += ext4_rec_len_from_disk(de->rec_len,
+						sb->s_blocksize);
 		}
 		offset = 0;
 		brelse(bh);