Skip to content

Commit

Permalink
xfs: recovery of swap extents operations for CRC filesystems
Browse files Browse the repository at this point in the history
This is the recovery side of the btree block owner change operation
performed by swapext on CRC enabled filesystems. We detect that an
owner change is needed by the flag that has been placed on the inode
log format flag field. Because the inode recovery is being replayed
after the buffers that make up the BMBT in the given checkpoint, we
can walk all the buffers and directly modify them when we see the
flag set on an inode.

Because the inode can be relogged and hence present in multiple
chekpoints with the "change owner" flag set, we could do multiple
passes across the inode to do this change. While this isn't optimal,
we can't directly ignore the flag as there may be multiple
independent swap extent operations being replayed on the same inode
in different checkpoints so we can't ignore them.

Further, because the owner change operation uses ordered buffers, we
might have buffers that are newer on disk than the current
checkpoint and so already have the owner changed in them. Hence we
cannot just peek at a buffer in the tree and check that it has the
correct owner and assume that the change was completed.

So, for the moment just brute force the owner change every time we
see an inode with the flag set. Note that we have to be careful here
because the owner of the buffers may point to either the old owner
or the new owner. Currently the verifier can't verify the owner
directly, so there is no failure case here right now. If we verify
the owner exactly in future, then we'll have to take this into
account.

This was tested in terms of normal operation via xfstests - all of
the fsr tests now pass without failure. however, we really need to
modify xfs/227 to stress v3 inodes correctly to ensure we fully
cover this case for v5 filesystems.

In terms of recovery testing, I used a hacked version of xfs_fsr
that held the temp inode open for a few seconds before exiting so
that the filesystem could be shut down with an open owner change
recovery flags set on at least the temp inode. fsr leaves the temp
inode unlinked and in btree format, so this was necessary for the
owner change to be reliably replayed.

logprint confirmed the tmp inode in the log had the correct flag set:

INO: cnt:3 total:3 a:0x69e9e0 len:56 a:0x69ea20 len:176 a:0x69eae0 len:88
        INODE: #regs:3   ino:0x44  flags:0x209   dsize:88
	                                 ^^^^^

0x200 is set, indicating a data fork owner change needed to be
replayed on inode 0x44.  A printk in the revoery code confirmed that
the inode change was recovered:

XFS (vdc): Mounting Filesystem
XFS (vdc): Starting recovery (logdev: internal)
recovering owner change ino 0x44
XFS (vdc): Version 5 superblock detected. This kernel L support enabled!
Use of these features in this kernel is at your own risk!
XFS (vdc): Ending recovery (logdev: internal)

The script used to test this was:

$ cat ./recovery-fsr.sh
#!/bin/bash

dev=/dev/vdc
mntpt=/mnt/scratch
testfile=$mntpt/testfile

umount $mntpt
mkfs.xfs -f -m crc=1 $dev
mount $dev $mntpt
chmod 777 $mntpt

for i in `seq 10000 -1 0`; do
        xfs_io -f -d -c "pwrite $(($i * 4096)) 4096" $testfile > /dev/null 2>&1
done
xfs_bmap -vp $testfile |head -20

xfs_fsr -d -v $testfile &
sleep 10
/home/dave/src/xfstests-dev/src/godown -f $mntpt
wait
umount $mntpt

xfs_logprint -t $dev |tail -20
time mount $dev $mntpt
xfs_bmap -vp $testfile
umount $mntpt
$

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
  • Loading branch information
Dave Chinner authored and Ben Myers committed Sep 10, 2013
1 parent 21b5c97 commit 638f441
Show file tree
Hide file tree
Showing 11 changed files with 171 additions and 67 deletions.
26 changes: 18 additions & 8 deletions fs/xfs/xfs_bmap_btree.c
Original file line number Diff line number Diff line change
Expand Up @@ -932,30 +932,40 @@ xfs_bmdr_maxrecs(
* we switch forks between inodes. The operation that the caller is doing will
* determine whether is needs to change owner before or after the switch.
*
* For demand paged modification, the fork switch should be done after reading
* in all the blocks, modifying them and pinning them in the transaction. For
* modification when the buffers are already pinned in memory, the fork switch
* can be done before changing the owner as we won't need to validate the owner
* until the btree buffers are unpinned and writes can occur again.
* For demand paged transactional modification, the fork switch should be done
* after reading in all the blocks, modifying them and pinning them in the
* transaction. For modification when the buffers are already pinned in memory,
* the fork switch can be done before changing the owner as we won't need to
* validate the owner until the btree buffers are unpinned and writes can occur
* again.
*
* For recovery based ownership change, there is no transactional context and
* so a buffer list must be supplied so that we can record the buffers that we
* modified for the caller to issue IO on.
*/
int
xfs_bmbt_change_owner(
struct xfs_trans *tp,
struct xfs_inode *ip,
int whichfork,
xfs_ino_t new_owner)
xfs_ino_t new_owner,
struct list_head *buffer_list)
{
struct xfs_btree_cur *cur;
int error;

ASSERT(tp || buffer_list);
ASSERT(!(tp && buffer_list));
if (whichfork == XFS_DATA_FORK)
ASSERT(ip->i_d.di_format = XFS_DINODE_FMT_BTREE);
else
ASSERT(ip->i_d.di_aformat = XFS_DINODE_FMT_BTREE);

cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork);
error = xfs_btree_change_owner(cur, new_owner);
if (!cur)
return ENOMEM;

error = xfs_btree_change_owner(cur, new_owner, buffer_list);
xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
return error;
}

3 changes: 2 additions & 1 deletion fs/xfs/xfs_bmap_btree.h
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,8 @@ extern int xfs_bmdr_maxrecs(struct xfs_mount *, int blocklen, int leaf);
extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf);

extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip,
int whichfork, xfs_ino_t new_owner);
int whichfork, xfs_ino_t new_owner,
struct list_head *buffer_list);

extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *,
struct xfs_trans *, struct xfs_inode *, int);
Expand Down
14 changes: 8 additions & 6 deletions fs/xfs/xfs_bmap_util.c
Original file line number Diff line number Diff line change
Expand Up @@ -1932,16 +1932,18 @@ xfs_swap_extents(
target_log_flags = XFS_ILOG_CORE;
if (ip->i_d.di_version == 3 &&
ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
target_log_flags |= XFS_ILOG_OWNER;
error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, tip->i_ino);
target_log_flags |= XFS_ILOG_DOWNER;
error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK,
tip->i_ino, NULL);
if (error)
goto out_trans_cancel;
}

if (tip->i_d.di_version == 3 &&
tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
src_log_flags |= XFS_ILOG_OWNER;
error = xfs_bmbt_change_owner(tp, tip, XFS_DATA_FORK, ip->i_ino);
src_log_flags |= XFS_ILOG_DOWNER;
error = xfs_bmbt_change_owner(tp, tip, XFS_DATA_FORK,
ip->i_ino, NULL);
if (error)
goto out_trans_cancel;
}
Expand Down Expand Up @@ -1997,7 +1999,7 @@ xfs_swap_extents(
break;
case XFS_DINODE_FMT_BTREE:
ASSERT(ip->i_d.di_version < 3 ||
(src_log_flags & XFS_ILOG_OWNER));
(src_log_flags & XFS_ILOG_DOWNER));
src_log_flags |= XFS_ILOG_DBROOT;
break;
}
Expand All @@ -2017,7 +2019,7 @@ xfs_swap_extents(
case XFS_DINODE_FMT_BTREE:
target_log_flags |= XFS_ILOG_DBROOT;
ASSERT(tip->i_d.di_version < 3 ||
(target_log_flags & XFS_ILOG_OWNER));
(target_log_flags & XFS_ILOG_DOWNER));
break;
}

Expand Down
32 changes: 20 additions & 12 deletions fs/xfs/xfs_btree.c
Original file line number Diff line number Diff line change
Expand Up @@ -3907,13 +3907,16 @@ xfs_btree_get_rec(
* buffer as an ordered buffer and log it appropriately. We need to ensure that
* we mark the region we change dirty so that if the buffer is relogged in
* a subsequent transaction the changes we make here as an ordered buffer are
* correctly relogged in that transaction.
* correctly relogged in that transaction. If we are in recovery context, then
* just queue the modified buffer as delayed write buffer so the transaction
* recovery completion writes the changes to disk.
*/
static int
xfs_btree_block_change_owner(
struct xfs_btree_cur *cur,
int level,
__uint64_t new_owner)
__uint64_t new_owner,
struct list_head *buffer_list)
{
struct xfs_btree_block *block;
struct xfs_buf *bp;
Expand All @@ -3930,16 +3933,19 @@ xfs_btree_block_change_owner(
block->bb_u.s.bb_owner = cpu_to_be32(new_owner);

/*
* Log owner change as an ordered buffer. If the block is a root block
* hosted in an inode, we might not have a buffer pointer here and we
* shouldn't attempt to log the change as the information is already
* held in the inode and discarded when the root block is formatted into
* the on-disk inode fork. We still change it, though, so everything is
* consistent in memory.
* If the block is a root block hosted in an inode, we might not have a
* buffer pointer here and we shouldn't attempt to log the change as the
* information is already held in the inode and discarded when the root
* block is formatted into the on-disk inode fork. We still change it,
* though, so everything is consistent in memory.
*/
if (bp) {
xfs_trans_ordered_buf(cur->bc_tp, bp);
xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
if (cur->bc_tp) {
xfs_trans_ordered_buf(cur->bc_tp, bp);
xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
} else {
xfs_buf_delwri_queue(bp, buffer_list);
}
} else {
ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
ASSERT(level == cur->bc_nlevels - 1);
Expand All @@ -3956,7 +3962,8 @@ xfs_btree_block_change_owner(
int
xfs_btree_change_owner(
struct xfs_btree_cur *cur,
__uint64_t new_owner)
__uint64_t new_owner,
struct list_head *buffer_list)
{
union xfs_btree_ptr lptr;
int level;
Expand Down Expand Up @@ -3986,7 +3993,8 @@ xfs_btree_change_owner(
/* for each buffer in the level */
do {
error = xfs_btree_block_change_owner(cur, level,
new_owner);
new_owner,
buffer_list);
} while (!error);

if (error != ENOENT)
Expand Down
3 changes: 2 additions & 1 deletion fs/xfs/xfs_btree.h
Original file line number Diff line number Diff line change
Expand Up @@ -445,7 +445,8 @@ int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *);
int xfs_btree_insert(struct xfs_btree_cur *, int *);
int xfs_btree_delete(struct xfs_btree_cur *, int *);
int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *);
int xfs_btree_change_owner(struct xfs_btree_cur *cur, __uint64_t new_owner);
int xfs_btree_change_owner(struct xfs_btree_cur *cur, __uint64_t new_owner,
struct list_head *buffer_list);

/*
* btree block CRC helpers
Expand Down
4 changes: 2 additions & 2 deletions fs/xfs/xfs_icache.c
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ STATIC void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp,
/*
* Allocate and initialise an xfs_inode.
*/
STATIC struct xfs_inode *
struct xfs_inode *
xfs_inode_alloc(
struct xfs_mount *mp,
xfs_ino_t ino)
Expand Down Expand Up @@ -98,7 +98,7 @@ xfs_inode_free_callback(
kmem_zone_free(xfs_inode_zone, ip);
}

STATIC void
void
xfs_inode_free(
struct xfs_inode *ip)
{
Expand Down
4 changes: 4 additions & 0 deletions fs/xfs/xfs_icache.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,10 @@ struct xfs_eofblocks {
int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino,
uint flags, uint lock_flags, xfs_inode_t **ipp);

/* recovery needs direct inode allocation capability */
struct xfs_inode * xfs_inode_alloc(struct xfs_mount *mp, xfs_ino_t ino);
void xfs_inode_free(struct xfs_inode *ip);

void xfs_reclaim_worker(struct work_struct *work);

int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
Expand Down
2 changes: 1 addition & 1 deletion fs/xfs/xfs_inode_buf.c
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@ xfs_imap_to_bp(
return 0;
}

STATIC void
void
xfs_dinode_from_disk(
xfs_icdinode_t *to,
xfs_dinode_t *from)
Expand Down
18 changes: 9 additions & 9 deletions fs/xfs/xfs_inode_buf.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,17 +32,17 @@ struct xfs_imap {
ushort im_boffset; /* inode offset in block in bytes */
};

int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *,
struct xfs_imap *, struct xfs_dinode **,
struct xfs_buf **, uint, uint);
int xfs_iread(struct xfs_mount *, struct xfs_trans *,
struct xfs_inode *, uint);
void xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *);
void xfs_dinode_to_disk(struct xfs_dinode *,
struct xfs_icdinode *);
int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *,
struct xfs_imap *, struct xfs_dinode **,
struct xfs_buf **, uint, uint);
int xfs_iread(struct xfs_mount *, struct xfs_trans *,
struct xfs_inode *, uint);
void xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *);
void xfs_dinode_to_disk(struct xfs_dinode *to, struct xfs_icdinode *from);
void xfs_dinode_from_disk(struct xfs_icdinode *to, struct xfs_dinode *from);

#if defined(DEBUG)
void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
#else
#define xfs_inobp_check(mp, bp)
#endif /* DEBUG */
Expand Down
9 changes: 6 additions & 3 deletions fs/xfs/xfs_log_format.h
Original file line number Diff line number Diff line change
Expand Up @@ -474,7 +474,8 @@ typedef struct xfs_inode_log_format_64 {
#define XFS_ILOG_ADATA 0x040 /* log i_af.if_data */
#define XFS_ILOG_AEXT 0x080 /* log i_af.if_extents */
#define XFS_ILOG_ABROOT 0x100 /* log i_af.i_broot */
#define XFS_ILOG_OWNER 0x200 /* change the extent tree owner on replay */
#define XFS_ILOG_DOWNER 0x200 /* change the data fork owner on replay */
#define XFS_ILOG_AOWNER 0x400 /* change the attr fork owner on replay */


/*
Expand All @@ -488,7 +489,8 @@ typedef struct xfs_inode_log_format_64 {
#define XFS_ILOG_NONCORE (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
XFS_ILOG_DBROOT | XFS_ILOG_DEV | \
XFS_ILOG_UUID | XFS_ILOG_ADATA | \
XFS_ILOG_AEXT | XFS_ILOG_ABROOT)
XFS_ILOG_AEXT | XFS_ILOG_ABROOT | \
XFS_ILOG_DOWNER | XFS_ILOG_AOWNER)

#define XFS_ILOG_DFORK (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
XFS_ILOG_DBROOT)
Expand All @@ -500,7 +502,8 @@ typedef struct xfs_inode_log_format_64 {
XFS_ILOG_DEXT | XFS_ILOG_DBROOT | \
XFS_ILOG_DEV | XFS_ILOG_UUID | \
XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
XFS_ILOG_ABROOT | XFS_ILOG_TIMESTAMP)
XFS_ILOG_ABROOT | XFS_ILOG_TIMESTAMP | \
XFS_ILOG_DOWNER | XFS_ILOG_AOWNER)

static inline int xfs_ilog_fbroot(int w)
{
Expand Down
Loading

0 comments on commit 638f441

Please sign in to comment.