From 757a69ef6cf2bf839bd4088e5609ddddd663b0c4 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 8 Aug 2017 18:19:47 -0700 Subject: [PATCH 01/44] xfs: write unmount record for ro mounts There are dueling comments in the xfs code about intent for log writes when unmounting a readonly filesystem. In xfs_mountfs, we see the intent: /* * Now the log is fully replayed, we can transition to full read-only * mode for read-only mounts. This will sync all the metadata and clean * the log so that the recovery we just performed does not have to be * replayed again on the next mount. */ and it calls xfs_quiesce_attr(), but by the time we get to xfs_log_unmount_write(), it returns early for a RDONLY mount: * Don't write out unmount record on read-only mounts. Because of this, sequential ro mounts of a filesystem with a dirty log will replay the log each time, which seems odd. Fix this by writing an unmount record even for RO mounts, as long as norecovery wasn't specified (don't write a clean log record if a dirty log may still be there!) and the log device is writable. Signed-off-by: Eric Sandeen Reviewed-by: Brian Foster Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_log.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 4ebd0bafc914c..972eda87db2ba 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -812,11 +812,14 @@ xfs_log_unmount_write(xfs_mount_t *mp) int error; /* - * Don't write out unmount record on read-only mounts. + * Don't write out unmount record on norecovery mounts or ro devices. * Or, if we are doing a forced umount (typically because of IO errors). */ - if (mp->m_flags & XFS_MOUNT_RDONLY) + if (mp->m_flags & XFS_MOUNT_NORECOVERY || + xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) { + ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); return 0; + } error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL); ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log))); From 6f4a1eefdd0ad4561543270a7fceadabcca075dd Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 8 Aug 2017 18:21:49 -0700 Subject: [PATCH 02/44] xfs: toggle readonly state around xfs_log_mount_finish When we do log recovery on a readonly mount, unlinked inode processing does not happen due to the readonly checks in xfs_inactive(), which are trying to prevent any I/O on a readonly mount. This is misguided - we do I/O on readonly mounts all the time, for consistency; for example, log recovery. So do the same RDONLY flag twiddling around xfs_log_mount_finish() as we do around xfs_log_mount(), for the same reason. This all cries out for a big rework but for now this is a simple fix to an obvious problem. Signed-off-by: Eric Sandeen Reviewed-by: Brian Foster Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_log.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 972eda87db2ba..f467f1230444f 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -743,10 +743,14 @@ xfs_log_mount_finish( struct xfs_mount *mp) { int error = 0; + bool readonly = (mp->m_flags & XFS_MOUNT_RDONLY); if (mp->m_flags & XFS_MOUNT_NORECOVERY) { ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); return 0; + } else if (readonly) { + /* Allow unlinked processing to proceed */ + mp->m_flags &= ~XFS_MOUNT_RDONLY; } /* @@ -764,6 +768,9 @@ xfs_log_mount_finish( xfs_log_work_queue(mp); mp->m_super->s_flags &= ~MS_ACTIVE; + if (readonly) + mp->m_flags |= XFS_MOUNT_RDONLY; + return error; } From 0b80ae6ed13169bd3a244e71169f2cc020b0c57a Mon Sep 17 00:00:00 2001 From: Carlos Maiolino Date: Tue, 8 Aug 2017 18:21:50 -0700 Subject: [PATCH 03/44] xfs: Add infrastructure needed for error propagation during buffer IO failure With the current code, XFS never re-submit a failed buffer for IO, because the failed item in the buffer is kept in the flush locked state forever. To be able to resubmit an log item for IO, we need a way to mark an item as failed, if, for any reason the buffer which the item belonged to failed during writeback. Add a new log item callback to be used after an IO completion failure and make the needed clean ups. Reviewed-by: Brian Foster Signed-off-by: Carlos Maiolino Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_buf_item.c | 32 +++++++++++++++++++++++++++++++- fs/xfs/xfs_trans.h | 7 +++++-- 2 files changed, 36 insertions(+), 3 deletions(-) diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index f6a8422e9562c..7573a1f0bc9a8 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -29,6 +29,7 @@ #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_log.h" +#include "xfs_inode.h" kmem_zone_t *xfs_buf_item_zone; @@ -1054,6 +1055,31 @@ xfs_buf_do_callbacks( } } +/* + * Invoke the error state callback for each log item affected by the failed I/O. + * + * If a metadata buffer write fails with a non-permanent error, the buffer is + * eventually resubmitted and so the completion callbacks are not run. The error + * state may need to be propagated to the log items attached to the buffer, + * however, so the next AIL push of the item knows hot to handle it correctly. + */ +STATIC void +xfs_buf_do_callbacks_fail( + struct xfs_buf *bp) +{ + struct xfs_log_item *next; + struct xfs_log_item *lip = bp->b_fspriv; + struct xfs_ail *ailp = lip->li_ailp; + + spin_lock(&ailp->xa_lock); + for (; lip; lip = next) { + next = lip->li_bio_list; + if (lip->li_ops->iop_error) + lip->li_ops->iop_error(lip, bp); + } + spin_unlock(&ailp->xa_lock); +} + static bool xfs_buf_iodone_callback_error( struct xfs_buf *bp) @@ -1123,7 +1149,11 @@ xfs_buf_iodone_callback_error( if ((mp->m_flags & XFS_MOUNT_UNMOUNTING) && mp->m_fail_unmount) goto permanent_error; - /* still a transient error, higher layers will retry */ + /* + * Still a transient error, run IO completion failure callbacks and let + * the higher layers retry the buffer. + */ + xfs_buf_do_callbacks_fail(bp); xfs_buf_ioerror(bp, 0); xfs_buf_relse(bp); return true; diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 6bdad6f589345..442d679210a18 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -64,11 +64,13 @@ typedef struct xfs_log_item { } xfs_log_item_t; #define XFS_LI_IN_AIL 0x1 -#define XFS_LI_ABORTED 0x2 +#define XFS_LI_ABORTED 0x2 +#define XFS_LI_FAILED 0x4 #define XFS_LI_FLAGS \ { XFS_LI_IN_AIL, "IN_AIL" }, \ - { XFS_LI_ABORTED, "ABORTED" } + { XFS_LI_ABORTED, "ABORTED" }, \ + { XFS_LI_FAILED, "FAILED" } struct xfs_item_ops { void (*iop_size)(xfs_log_item_t *, int *, int *); @@ -79,6 +81,7 @@ struct xfs_item_ops { void (*iop_unlock)(xfs_log_item_t *); xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t); void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t); + void (*iop_error)(xfs_log_item_t *, xfs_buf_t *); }; void xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item, From d3a304b6292168b83b45d624784f973fdc1ca674 Mon Sep 17 00:00:00 2001 From: Carlos Maiolino Date: Tue, 8 Aug 2017 18:21:50 -0700 Subject: [PATCH 04/44] xfs: Properly retry failed inode items in case of error during buffer writeback When a buffer has been failed during writeback, the inode items into it are kept flush locked, and are never resubmitted due the flush lock, so, if any buffer fails to be written, the items in AIL are never written to disk and never unlocked. This causes unmount operation to hang due these items flush locked in AIL, but this also causes the items in AIL to never be written back, even when the IO device comes back to normal. I've been testing this patch with a DM-thin device, creating a filesystem larger than the real device. When writing enough data to fill the DM-thin device, XFS receives ENOSPC errors from the device, and keep spinning on xfsaild (when 'retry forever' configuration is set). At this point, the filesystem can not be unmounted because of the flush locked items in AIL, but worse, the items in AIL are never retried at all (once xfs_inode_item_push() will skip the items that are flush locked), even if the underlying DM-thin device is expanded to the proper size. This patch fixes both cases, retrying any item that has been failed previously, using the infra-structure provided by the previous patch. Reviewed-by: Brian Foster Signed-off-by: Carlos Maiolino Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_buf_item.c | 28 ++++++++++++++++++++++++ fs/xfs/xfs_buf_item.h | 3 +++ fs/xfs/xfs_inode_item.c | 47 +++++++++++++++++++++++++++++++++++++---- fs/xfs/xfs_trans.h | 1 + fs/xfs/xfs_trans_ail.c | 3 ++- fs/xfs/xfs_trans_priv.h | 31 +++++++++++++++++++++++++++ 6 files changed, 108 insertions(+), 5 deletions(-) diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 7573a1f0bc9a8..573fc72c3f23b 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -1234,3 +1234,31 @@ xfs_buf_iodone( xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE); xfs_buf_item_free(BUF_ITEM(lip)); } + +/* + * Requeue a failed buffer for writeback + * + * Return true if the buffer has been re-queued properly, false otherwise + */ +bool +xfs_buf_resubmit_failed_buffers( + struct xfs_buf *bp, + struct xfs_log_item *lip, + struct list_head *buffer_list) +{ + struct xfs_log_item *next; + + /* + * Clear XFS_LI_FAILED flag from all items before resubmit + * + * XFS_LI_FAILED set/clear is protected by xa_lock, caller this + * function already have it acquired + */ + for (; lip; lip = next) { + next = lip->li_bio_list; + xfs_clear_li_failed(lip); + } + + /* Add this buffer back to the delayed write list */ + return xfs_buf_delwri_queue(bp, buffer_list); +} diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index f7eba99d19dde..530686e1afb94 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -70,6 +70,9 @@ void xfs_buf_attach_iodone(struct xfs_buf *, xfs_log_item_t *); void xfs_buf_iodone_callbacks(struct xfs_buf *); void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *); +bool xfs_buf_resubmit_failed_buffers(struct xfs_buf *, + struct xfs_log_item *, + struct list_head *); extern kmem_zone_t *xfs_buf_item_zone; diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 013cc78d7daf4..6d0f74ec31e89 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -27,6 +27,7 @@ #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_trans_priv.h" +#include "xfs_buf_item.h" #include "xfs_log.h" @@ -475,6 +476,23 @@ xfs_inode_item_unpin( wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT); } +/* + * Callback used to mark a buffer with XFS_LI_FAILED when items in the buffer + * have been failed during writeback + * + * This informs the AIL that the inode is already flush locked on the next push, + * and acquires a hold on the buffer to ensure that it isn't reclaimed before + * dirty data makes it to disk. + */ +STATIC void +xfs_inode_item_error( + struct xfs_log_item *lip, + struct xfs_buf *bp) +{ + ASSERT(xfs_isiflocked(INODE_ITEM(lip)->ili_inode)); + xfs_set_li_failed(lip, bp); +} + STATIC uint xfs_inode_item_push( struct xfs_log_item *lip, @@ -484,13 +502,28 @@ xfs_inode_item_push( { struct xfs_inode_log_item *iip = INODE_ITEM(lip); struct xfs_inode *ip = iip->ili_inode; - struct xfs_buf *bp = NULL; + struct xfs_buf *bp = lip->li_buf; uint rval = XFS_ITEM_SUCCESS; int error; if (xfs_ipincount(ip) > 0) return XFS_ITEM_PINNED; + /* + * The buffer containing this item failed to be written back + * previously. Resubmit the buffer for IO. + */ + if (lip->li_flags & XFS_LI_FAILED) { + if (!xfs_buf_trylock(bp)) + return XFS_ITEM_LOCKED; + + if (!xfs_buf_resubmit_failed_buffers(bp, lip, buffer_list)) + rval = XFS_ITEM_FLUSHING; + + xfs_buf_unlock(bp); + return rval; + } + if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) return XFS_ITEM_LOCKED; @@ -622,7 +655,8 @@ static const struct xfs_item_ops xfs_inode_item_ops = { .iop_unlock = xfs_inode_item_unlock, .iop_committed = xfs_inode_item_committed, .iop_push = xfs_inode_item_push, - .iop_committing = xfs_inode_item_committing + .iop_committing = xfs_inode_item_committing, + .iop_error = xfs_inode_item_error }; @@ -710,7 +744,8 @@ xfs_iflush_done( * the AIL lock. */ iip = INODE_ITEM(blip); - if (iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) + if ((iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) || + lip->li_flags & XFS_LI_FAILED) need_ail++; blip = next; @@ -718,7 +753,8 @@ xfs_iflush_done( /* make sure we capture the state of the initial inode. */ iip = INODE_ITEM(lip); - if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) + if ((iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) || + lip->li_flags & XFS_LI_FAILED) need_ail++; /* @@ -739,6 +775,9 @@ xfs_iflush_done( if (INODE_ITEM(blip)->ili_logged && blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn) mlip_changed |= xfs_ail_delete_one(ailp, blip); + else { + xfs_clear_li_failed(blip); + } } if (mlip_changed) { diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 442d679210a18..7d627721e4b31 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -49,6 +49,7 @@ typedef struct xfs_log_item { struct xfs_ail *li_ailp; /* ptr to AIL */ uint li_type; /* item type */ uint li_flags; /* misc flags */ + struct xfs_buf *li_buf; /* real buffer pointer */ struct xfs_log_item *li_bio_list; /* buffer item list */ void (*li_cb)(struct xfs_buf *, struct xfs_log_item *); diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 9056c0f34a3c4..70f5ab0173236 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -687,12 +687,13 @@ xfs_trans_ail_update_bulk( bool xfs_ail_delete_one( struct xfs_ail *ailp, - struct xfs_log_item *lip) + struct xfs_log_item *lip) { struct xfs_log_item *mlip = xfs_ail_min(ailp); trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn); xfs_ail_delete(ailp, lip); + xfs_clear_li_failed(lip); lip->li_flags &= ~XFS_LI_IN_AIL; lip->li_lsn = 0; diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index d91706c56c633..b317a3644c006 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -164,4 +164,35 @@ xfs_trans_ail_copy_lsn( *dst = *src; } #endif + +static inline void +xfs_clear_li_failed( + struct xfs_log_item *lip) +{ + struct xfs_buf *bp = lip->li_buf; + + ASSERT(lip->li_flags & XFS_LI_IN_AIL); + lockdep_assert_held(&lip->li_ailp->xa_lock); + + if (lip->li_flags & XFS_LI_FAILED) { + lip->li_flags &= ~XFS_LI_FAILED; + lip->li_buf = NULL; + xfs_buf_rele(bp); + } +} + +static inline void +xfs_set_li_failed( + struct xfs_log_item *lip, + struct xfs_buf *bp) +{ + lockdep_assert_held(&lip->li_ailp->xa_lock); + + if (!(lip->li_flags & XFS_LI_FAILED)) { + xfs_buf_hold(bp); + lip->li_flags |= XFS_LI_FAILED; + lip->li_buf = bp; + } +} + #endif /* __XFS_TRANS_PRIV_H__ */ From 284f1c2c9bebf871861184b0e2c40fa921dd380b Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 8 Aug 2017 18:21:51 -0700 Subject: [PATCH 05/44] xfs: fix recovery failure when log record header wraps log end The high-level log recovery algorithm consists of two loops that walk the physical log and process log records from the tail to the head. The first loop handles the case where the tail is beyond the head and processes records up to the end of the physical log. The subsequent loop processes records from the beginning of the physical log to the head. Because log records can wrap around the end of the physical log, the first loop mentioned above must handle this case appropriately. Records are processed from in-core buffers, which means that this algorithm must split the reads of such records into two partial I/Os: 1.) from the beginning of the record to the end of the log and 2.) from the beginning of the log to the end of the record. This is further complicated by the fact that the log record header and log record data are read into independent buffers. The current handling of each buffer correctly splits the reads when either the header or data starts before the end of the log and wraps around the end. The data read does not correctly handle the case where the prior header read wrapped or ends on the physical log end boundary. blk_no is incremented to or beyond the log end after the header read to point to the record data, but the split data read logic triggers, attempts to read from an invalid log block and ultimately causes log recovery to fail. This can be reproduced fairly reliably via xfstests tests generic/047 and generic/388 with large iclog sizes (256k) and small (10M) logs. If the record header read has pushed beyond the end of the physical log, the subsequent data read is actually contiguous. Update the data read logic to detect the case where blk_no has wrapped, mod it against the log size to read from the correct address and issue one contiguous read for the log data buffer. The log record is processed as normal from the buffer(s), the loop exits after the current iteration and the subsequent loop picks up with the first new record after the start of the log. Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_log_recover.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 9549188f5a36d..36a179f2c9318 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -5218,7 +5218,7 @@ xlog_do_recovery_pass( xfs_daddr_t *first_bad) /* out: first bad log rec */ { xlog_rec_header_t *rhead; - xfs_daddr_t blk_no; + xfs_daddr_t blk_no, rblk_no; xfs_daddr_t rhead_blk; char *offset; xfs_buf_t *hbp, *dbp; @@ -5371,9 +5371,19 @@ xlog_do_recovery_pass( bblks = (int)BTOBB(be32_to_cpu(rhead->h_len)); blk_no += hblks; - /* Read in data for log record */ - if (blk_no + bblks <= log->l_logBBsize) { - error = xlog_bread(log, blk_no, bblks, dbp, + /* + * Read the log record data in multiple reads if it + * wraps around the end of the log. Note that if the + * header already wrapped, blk_no could point past the + * end of the log. The record data is contiguous in + * that case. + */ + if (blk_no + bblks <= log->l_logBBsize || + blk_no >= log->l_logBBsize) { + /* mod blk_no in case the header wrapped and + * pushed it beyond the end of the log */ + rblk_no = do_mod(blk_no, log->l_logBBsize); + error = xlog_bread(log, rblk_no, bblks, dbp, &offset); if (error) goto bread_err2; From 5297ac1f6d7cbf45464a49b9558831f271dfc559 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 8 Aug 2017 18:21:51 -0700 Subject: [PATCH 06/44] xfs: always verify the log tail during recovery Log tail verification currently only occurs when torn writes are detected at the head of the log. This was introduced because a change in the head block due to torn writes can lead to a change in the tail block (each log record header references the current tail) and the tail block should be verified before log recovery proceeds. Tail corruption is possible outside of torn write scenarios, however. For example, partial log writes can be detected and cleared during the initial head/tail block discovery process. If the partial write coincides with a tail overwrite, the log tail is corrupted and recovery fails. To facilitate correct handling of log tail overwites, update log recovery to always perform tail verification. This is necessary to detect potential tail overwrite conditions when torn writes may not have occurred. This changes normal (i.e., no torn writes) recovery behavior slightly to detect and return CRC related errors near the tail before actual recovery starts. Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_log_recover.c | 26 +++----------------------- 1 file changed, 3 insertions(+), 23 deletions(-) diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 36a179f2c9318..c1337127d8fbe 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1183,31 +1183,11 @@ xlog_verify_head( ASSERT(0); return 0; } - - /* - * Now verify the tail based on the updated head. This is - * required because the torn writes trimmed from the head could - * have been written over the tail of a previous record. Return - * any errors since recovery cannot proceed if the tail is - * corrupt. - * - * XXX: This leaves a gap in truly robust protection from torn - * writes in the log. If the head is behind the tail, the tail - * pushes forward to create some space and then a crash occurs - * causing the writes into the previous record's tail region to - * tear, log recovery isn't able to recover. - * - * How likely is this to occur? If possible, can we do something - * more intelligent here? Is it safe to push the tail forward if - * we can determine that the tail is within the range of the - * torn write (e.g., the kernel can only overwrite the tail if - * it has actually been pushed forward)? Alternatively, could we - * somehow prevent this condition at runtime? - */ - error = xlog_verify_tail(log, *head_blk, *tail_blk); } + if (error) + return error; - return error; + return xlog_verify_tail(log, *head_blk, *tail_blk); } /* From 4a4f66eac4681378996a1837ad1ffec3a2e2981f Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 8 Aug 2017 18:21:52 -0700 Subject: [PATCH 07/44] xfs: fix log recovery corruption error due to tail overwrite If we consider the case where the tail (T) of the log is pinned long enough for the head (H) to push and block behind the tail, we can end up blocked in the following state without enough free space (f) in the log to satisfy a transaction reservation: 0 phys. log N [-------HffT---H'--T'---] The last good record in the log (before H) refers to T. The tail eventually pushes forward (T') leaving more free space in the log for writes to H. At this point, suppose space frees up in the log for the maximum of 8 in-core log buffers to start flushing out to the log. If this pushes the head from H to H', these next writes overwrite the previous tail T. This is safe because the items logged from T to T' have been written back and removed from the AIL. If the next log writes (H -> H') happen to fail and result in partial records in the log, the filesystem shuts down having overwritten T with invalid data. Log recovery correctly locates H on the subsequent mount, but H still refers to the now corrupted tail T. This results in log corruption errors and recovery failure. Since the tail overwrite results from otherwise correct runtime behavior, it is up to log recovery to try and deal with this situation. Update log recovery tail verification to run a CRC pass from the first record past the tail to the head. This facilitates error detection at T and moves the recovery tail to the first good record past H' (similar to truncating the head on torn write detection). If corruption is detected beyond the range possibly affected by the max number of iclogs, the log is legitimately corrupted and log recovery failure is expected. Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_log_recover.c | 108 ++++++++++++++++++++++++++++----------- 1 file changed, 77 insertions(+), 31 deletions(-) diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index c1337127d8fbe..a5e2ca8f5cd63 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1029,61 +1029,106 @@ xlog_seek_logrec_hdr( } /* - * Check the log tail for torn writes. This is required when torn writes are - * detected at the head and the head had to be walked back to a previous record. - * The tail of the previous record must now be verified to ensure the torn - * writes didn't corrupt the previous tail. + * Calculate distance from head to tail (i.e., unused space in the log). + */ +static inline int +xlog_tail_distance( + struct xlog *log, + xfs_daddr_t head_blk, + xfs_daddr_t tail_blk) +{ + if (head_blk < tail_blk) + return tail_blk - head_blk; + + return tail_blk + (log->l_logBBsize - head_blk); +} + +/* + * Verify the log tail. This is particularly important when torn or incomplete + * writes have been detected near the front of the log and the head has been + * walked back accordingly. + * + * We also have to handle the case where the tail was pinned and the head + * blocked behind the tail right before a crash. If the tail had been pushed + * immediately prior to the crash and the subsequent checkpoint was only + * partially written, it's possible it overwrote the last referenced tail in the + * log with garbage. This is not a coherency problem because the tail must have + * been pushed before it can be overwritten, but appears as log corruption to + * recovery because we have no way to know the tail was updated if the + * subsequent checkpoint didn't write successfully. * - * Return an error if CRC verification fails as recovery cannot proceed. + * Therefore, CRC check the log from tail to head. If a failure occurs and the + * offending record is within max iclog bufs from the head, walk the tail + * forward and retry until a valid tail is found or corruption is detected out + * of the range of a possible overwrite. */ STATIC int xlog_verify_tail( struct xlog *log, xfs_daddr_t head_blk, - xfs_daddr_t tail_blk) + xfs_daddr_t *tail_blk, + int hsize) { struct xlog_rec_header *thead; struct xfs_buf *bp; xfs_daddr_t first_bad; - int count; int error = 0; bool wrapped; - xfs_daddr_t tmp_head; + xfs_daddr_t tmp_tail; + xfs_daddr_t orig_tail = *tail_blk; bp = xlog_get_bp(log, 1); if (!bp) return -ENOMEM; /* - * Seek XLOG_MAX_ICLOGS + 1 records past the current tail record to get - * a temporary head block that points after the last possible - * concurrently written record of the tail. + * Make sure the tail points to a record (returns positive count on + * success). */ - count = xlog_seek_logrec_hdr(log, head_blk, tail_blk, - XLOG_MAX_ICLOGS + 1, bp, &tmp_head, &thead, - &wrapped); - if (count < 0) { - error = count; + error = xlog_seek_logrec_hdr(log, head_blk, *tail_blk, 1, bp, + &tmp_tail, &thead, &wrapped); + if (error < 0) goto out; - } + if (*tail_blk != tmp_tail) + *tail_blk = tmp_tail; /* - * If the call above didn't find XLOG_MAX_ICLOGS + 1 records, we ran - * into the actual log head. tmp_head points to the start of the record - * so update it to the actual head block. + * Run a CRC check from the tail to the head. We can't just check + * MAX_ICLOGS records past the tail because the tail may point to stale + * blocks cleared during the search for the head/tail. These blocks are + * overwritten with zero-length records and thus record count is not a + * reliable indicator of the iclog state before a crash. */ - if (count < XLOG_MAX_ICLOGS + 1) - tmp_head = head_blk; - - /* - * We now have a tail and temporary head block that covers at least - * XLOG_MAX_ICLOGS records from the tail. We need to verify that these - * records were completely written. Run a CRC verification pass from - * tail to head and return the result. - */ - error = xlog_do_recovery_pass(log, tmp_head, tail_blk, + first_bad = 0; + error = xlog_do_recovery_pass(log, head_blk, *tail_blk, XLOG_RECOVER_CRCPASS, &first_bad); + while (error == -EFSBADCRC && first_bad) { + int tail_distance; + + /* + * Is corruption within range of the head? If so, retry from + * the next record. Otherwise return an error. + */ + tail_distance = xlog_tail_distance(log, head_blk, first_bad); + if (tail_distance > BTOBB(XLOG_MAX_ICLOGS * hsize)) + break; + /* skip to the next record; returns positive count on success */ + error = xlog_seek_logrec_hdr(log, head_blk, first_bad, 2, bp, + &tmp_tail, &thead, &wrapped); + if (error < 0) + goto out; + + *tail_blk = tmp_tail; + first_bad = 0; + error = xlog_do_recovery_pass(log, head_blk, *tail_blk, + XLOG_RECOVER_CRCPASS, &first_bad); + } + + if (!error && *tail_blk != orig_tail) + xfs_warn(log->l_mp, + "Tail block (0x%llx) overwrite detected. Updated to 0x%llx", + orig_tail, *tail_blk); out: xlog_put_bp(bp); return error; @@ -1187,7 +1232,8 @@ xlog_verify_head( if (error) return error; - return xlog_verify_tail(log, *head_blk, *tail_blk); + return xlog_verify_tail(log, *head_blk, tail_blk, + be32_to_cpu((*rhead)->h_size)); } /* From 7f4d01f36a3ac16f539f0fd3839de5d58fa4940f Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 8 Aug 2017 18:21:52 -0700 Subject: [PATCH 08/44] xfs: add log item pinning error injection tag Add an error injection tag to force log items in the AIL to the pinned state. This option can be used by test infrastructure to induce head behind tail conditions. Specifically, this is intended to be used by xfstests to reproduce log recovery problems after failed/corrupted log writes overwrite the last good tail LSN in the log. When enabled, AIL push attempts see log items in the AIL in the pinned state. This stalls metadata writeback and thus prevents the current tail of the log from moving forward. When disabled, subsequent AIL pushes observe the log items in their appropriate state and filesystem operation continues as normal. Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_error.c | 3 +++ fs/xfs/xfs_error.h | 4 +++- fs/xfs/xfs_trans_ail.c | 17 ++++++++++++++++- 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 2f4feb959bfb4..bd786a9ac2c38 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -57,6 +57,7 @@ static unsigned int xfs_errortag_random_default[] = { XFS_RANDOM_AG_RESV_CRITICAL, XFS_RANDOM_DROP_WRITES, XFS_RANDOM_LOG_BAD_CRC, + XFS_RANDOM_LOG_ITEM_PIN, }; struct xfs_errortag_attr { @@ -161,6 +162,7 @@ XFS_ERRORTAG_ATTR_RW(bmap_finish_one, XFS_ERRTAG_BMAP_FINISH_ONE); XFS_ERRORTAG_ATTR_RW(ag_resv_critical, XFS_ERRTAG_AG_RESV_CRITICAL); XFS_ERRORTAG_ATTR_RW(drop_writes, XFS_ERRTAG_DROP_WRITES); XFS_ERRORTAG_ATTR_RW(log_bad_crc, XFS_ERRTAG_LOG_BAD_CRC); +XFS_ERRORTAG_ATTR_RW(log_item_pin, XFS_ERRTAG_LOG_ITEM_PIN); static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(noerror), @@ -193,6 +195,7 @@ static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(ag_resv_critical), XFS_ERRORTAG_ATTR_LIST(drop_writes), XFS_ERRORTAG_ATTR_LIST(log_bad_crc), + XFS_ERRORTAG_ATTR_LIST(log_item_pin), NULL, }; diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 7577be5f09bc0..7c4bef3bddb75 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -106,7 +106,8 @@ extern void xfs_verifier_error(struct xfs_buf *bp); */ #define XFS_ERRTAG_DROP_WRITES 28 #define XFS_ERRTAG_LOG_BAD_CRC 29 -#define XFS_ERRTAG_MAX 30 +#define XFS_ERRTAG_LOG_ITEM_PIN 30 +#define XFS_ERRTAG_MAX 31 /* * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. @@ -141,6 +142,7 @@ extern void xfs_verifier_error(struct xfs_buf *bp); #define XFS_RANDOM_AG_RESV_CRITICAL 4 #define XFS_RANDOM_DROP_WRITES 1 #define XFS_RANDOM_LOG_BAD_CRC 1 +#define XFS_RANDOM_LOG_ITEM_PIN 1 #ifdef DEBUG extern int xfs_errortag_init(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 70f5ab0173236..354368a906e59 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -325,6 +325,21 @@ xfs_ail_delete( xfs_trans_ail_cursor_clear(ailp, lip); } +static inline uint +xfsaild_push_item( + struct xfs_ail *ailp, + struct xfs_log_item *lip) +{ + /* + * If log item pinning is enabled, skip the push and track the item as + * pinned. This can help induce head-behind-tail conditions. + */ + if (XFS_TEST_ERROR(false, ailp->xa_mount, XFS_ERRTAG_LOG_ITEM_PIN)) + return XFS_ITEM_PINNED; + + return lip->li_ops->iop_push(lip, &ailp->xa_buf_list); +} + static long xfsaild_push( struct xfs_ail *ailp) @@ -382,7 +397,7 @@ xfsaild_push( * rely on the AIL cursor implementation to be able to deal with * the dropped lock. */ - lock_result = lip->li_ops->iop_push(lip, &ailp->xa_buf_list); + lock_result = xfsaild_push_item(ailp, lip); switch (lock_result) { case XFS_ITEM_SUCCESS: XFS_STATS_INC(mp, xs_push_ail_success); From a4c9b34d6a17081005ec459b57b8effc08f4c731 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 8 Aug 2017 18:21:53 -0700 Subject: [PATCH 09/44] xfs: handle -EFSCORRUPTED during head/tail verification Torn write and tail overwrite detection both trigger only on -EFSBADCRC errors. While this is the most likely failure scenario for each condition, -EFSCORRUPTED is still possible in certain cases depending on what ends up on disk when a torn write or partial tail overwrite occurs. For example, an invalid log record h_len can lead to an -EFSCORRUPTED error when running the log recovery CRC pass. Therefore, update log head and tail verification to trigger the associated head/tail fixups in the event of -EFSCORRUPTED errors along with -EFSBADCRC. Also, -EFSCORRUPTED can currently be returned from xlog_do_recovery_pass() before rhead_blk is initialized if the first record encountered happens to be corrupted. This leads to an incorrect 'first_bad' return value. Initialize rhead_blk earlier in the function to address that problem as well. Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_log_recover.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index a5e2ca8f5cd63..43d5df3a563f7 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1102,7 +1102,7 @@ xlog_verify_tail( first_bad = 0; error = xlog_do_recovery_pass(log, head_blk, *tail_blk, XLOG_RECOVER_CRCPASS, &first_bad); - while (error == -EFSBADCRC && first_bad) { + while ((error == -EFSBADCRC || error == -EFSCORRUPTED) && first_bad) { int tail_distance; /* @@ -1188,7 +1188,7 @@ xlog_verify_head( */ error = xlog_do_recovery_pass(log, *head_blk, tmp_rhead_blk, XLOG_RECOVER_CRCPASS, &first_bad); - if (error == -EFSBADCRC) { + if ((error == -EFSBADCRC || error == -EFSCORRUPTED) && first_bad) { /* * We've hit a potential torn write. Reset the error and warn * about it. @@ -5257,7 +5257,7 @@ xlog_do_recovery_pass( LIST_HEAD (buffer_list); ASSERT(head_blk != tail_blk); - rhead_blk = 0; + blk_no = rhead_blk = tail_blk; for (i = 0; i < XLOG_RHASH_SIZE; i++) INIT_HLIST_HEAD(&rhash[i]); @@ -5335,7 +5335,6 @@ xlog_do_recovery_pass( } memset(rhash, 0, sizeof(rhash)); - blk_no = rhead_blk = tail_blk; if (tail_blk > head_blk) { /* * Perform recovery around the end of the physical log. From e67d3d4246e5fbb0c7c700426d11241ca9c6f473 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 8 Aug 2017 18:21:53 -0700 Subject: [PATCH 10/44] xfs: add log recovery tracepoint for head/tail Torn write detection and tail overwrite detection can shift the log head and tail respectively in the event of CRC mismatch or corruption errors. Add a high-level log recovery tracepoint to dump the final log head/tail and make those values easily attainable in debug/diagnostic situations. Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_log_recover.c | 2 ++ fs/xfs/xfs_trace.h | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 43d5df3a563f7..a36239980cf71 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -5598,6 +5598,8 @@ xlog_do_recover( xfs_buf_t *bp; xfs_sb_t *sbp; + trace_xfs_log_recover(log, head_blk, tail_blk); + /* * First replay the images in the log. */ diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index bcc3cdf8e1c5d..68810477ef2c2 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1963,6 +1963,24 @@ DEFINE_EVENT(xfs_swap_extent_class, name, \ DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before); DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after); +TRACE_EVENT(xfs_log_recover, + TP_PROTO(struct xlog *log, xfs_daddr_t headblk, xfs_daddr_t tailblk), + TP_ARGS(log, headblk, tailblk), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_daddr_t, headblk) + __field(xfs_daddr_t, tailblk) + ), + TP_fast_assign( + __entry->dev = log->l_mp->m_super->s_dev; + __entry->headblk = headblk; + __entry->tailblk = tailblk; + ), + TP_printk("dev %d:%d headblk 0x%llx tailblk 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->headblk, + __entry->tailblk) +) + TRACE_EVENT(xfs_log_recover_record, TP_PROTO(struct xlog *log, struct xlog_rec_header *rhead, int pass), TP_ARGS(log, rhead, pass), From 2d32311cf19bfb8c1d2b4601974ddd951f9cfd0b Mon Sep 17 00:00:00 2001 From: Carlos Maiolino Date: Fri, 18 Aug 2017 18:07:04 -0700 Subject: [PATCH 11/44] xfs: stop searching for free slots in an inode chunk when there are none In a filesystem without finobt, the Space manager selects an AG to alloc a new inode, where xfs_dialloc_ag_inobt() will search the AG for the free slot chunk. When the new inode is in the same AG as its parent, the btree will be searched starting on the parent's record, and then retried from the top if no slot is available beyond the parent's record. To exit this loop though, xfs_dialloc_ag_inobt() relies on the fact that the btree must have a free slot available, once its callers relied on the agi->freecount when deciding how/where to allocate this new inode. In the case when the agi->freecount is corrupted, showing available inodes in an AG, when in fact there is none, this becomes an infinite loop. Add a way to stop the loop when a free slot is not found in the btree, making the function to fall into the whole AG scan which will then, be able to detect the corruption and shut the filesystem down. As pointed by Brian, this might impact performance, giving the fact we don't reset the search distance anymore when we reach the end of the tree, giving it fewer tries before falling back to the whole AG search, but it will only affect searches that start within 10 records to the end of the tree. Signed-off-by: Carlos Maiolino Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_ialloc.c | 55 +++++++++++++++++++------------------- 1 file changed, 27 insertions(+), 28 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index abf5beaae907d..1e0658a3f155a 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -1133,6 +1133,7 @@ xfs_dialloc_ag_inobt( int error; int offset; int i, j; + int searchdistance = 10; pag = xfs_perag_get(mp, agno); @@ -1159,7 +1160,6 @@ xfs_dialloc_ag_inobt( if (pagno == agno) { int doneleft; /* done, to the left */ int doneright; /* done, to the right */ - int searchdistance = 10; error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i); if (error) @@ -1220,21 +1220,9 @@ xfs_dialloc_ag_inobt( /* * Loop until we find an inode chunk with a free inode. */ - while (!doneleft || !doneright) { + while (--searchdistance > 0 && (!doneleft || !doneright)) { int useleft; /* using left inode chunk this time */ - if (!--searchdistance) { - /* - * Not in range - save last search - * location and allocate a new inode - */ - xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); - pag->pagl_leftrec = trec.ir_startino; - pag->pagl_rightrec = rec.ir_startino; - pag->pagl_pagino = pagino; - goto newino; - } - /* figure out the closer block if both are valid. */ if (!doneleft && !doneright) { useleft = pagino - @@ -1278,26 +1266,37 @@ xfs_dialloc_ag_inobt( goto error1; } - /* - * We've reached the end of the btree. because - * we are only searching a small chunk of the - * btree each search, there is obviously free - * inodes closer to the parent inode than we - * are now. restart the search again. - */ - pag->pagl_pagino = NULLAGINO; - pag->pagl_leftrec = NULLAGINO; - pag->pagl_rightrec = NULLAGINO; - xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); - xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); - goto restart_pagno; + if (searchdistance <= 0) { + /* + * Not in range - save last search + * location and allocate a new inode + */ + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + pag->pagl_leftrec = trec.ir_startino; + pag->pagl_rightrec = rec.ir_startino; + pag->pagl_pagino = pagino; + + } else { + /* + * We've reached the end of the btree. because + * we are only searching a small chunk of the + * btree each search, there is obviously free + * inodes closer to the parent inode than we + * are now. restart the search again. + */ + pag->pagl_pagino = NULLAGINO; + pag->pagl_leftrec = NULLAGINO; + pag->pagl_rightrec = NULLAGINO; + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + goto restart_pagno; + } } /* * In a different AG from the parent. * See if the most recently allocated block has any free. */ -newino: if (agi->agi_newino != cpu_to_be32(NULLAGINO)) { error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino), XFS_LOOKUP_EQ, &i); From 799ea9e9c59949008770aab4e1da87f10e99dbe4 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 18 Aug 2017 18:08:25 -0700 Subject: [PATCH 12/44] xfs: evict all inodes involved with log redo item When we introduced the bmap redo log items, we set MS_ACTIVE on the mountpoint and XFS_IRECOVERY on the inode to prevent unlinked inodes from being truncated prematurely during log recovery. This also had the effect of putting linked inodes on the lru instead of evicting them. Unfortunately, we neglected to find all those unreferenced lru inodes and evict them after finishing log recovery, which means that we leak them if anything goes wrong in the rest of xfs_mountfs, because the lru is only cleaned out on unmount. Therefore, evict unreferenced inodes in the lru list immediately after clearing MS_ACTIVE. Fixes: 17c12bcd30 ("xfs: when replaying bmap operations, don't let unlinked inodes get reaped") Signed-off-by: Darrick J. Wong Cc: viro@ZenIV.linux.org.uk Reviewed-by: Brian Foster --- fs/inode.c | 1 + fs/internal.h | 1 - fs/xfs/xfs_log.c | 12 ++++++++++++ include/linux/fs.h | 1 + 4 files changed, 14 insertions(+), 1 deletion(-) diff --git a/fs/inode.c b/fs/inode.c index 50370599e3710..6a1626e0edafc 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -637,6 +637,7 @@ void evict_inodes(struct super_block *sb) dispose_list(&dispose); } +EXPORT_SYMBOL_GPL(evict_inodes); /** * invalidate_inodes - attempt to free all inodes on a superblock diff --git a/fs/internal.h b/fs/internal.h index 9676fe11c0935..fedfe94d84ba5 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -132,7 +132,6 @@ static inline bool atime_needs_update_rcu(const struct path *path, extern void inode_io_list_del(struct inode *inode); extern long get_nr_dirty_inodes(void); -extern void evict_inodes(struct super_block *); extern int invalidate_inodes(struct super_block *, bool); /* diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index f467f1230444f..bcb2f860e508d 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -761,12 +761,24 @@ xfs_log_mount_finish( * inodes. Turn it off immediately after recovery finishes * so that we don't leak the quota inodes if subsequent mount * activities fail. + * + * We let all inodes involved in redo item processing end up on + * the LRU instead of being evicted immediately so that if we do + * something to an unlinked inode, the irele won't cause + * premature truncation and freeing of the inode, which results + * in log recovery failure. We have to evict the unreferenced + * lru inodes after clearing MS_ACTIVE because we don't + * otherwise clean up the lru if there's a subsequent failure in + * xfs_mountfs, which leads to us leaking the inodes if nothing + * else (e.g. quotacheck) references the inodes before the + * mount failure occurs. */ mp->m_super->s_flags |= MS_ACTIVE; error = xlog_recover_finish(mp->m_log); if (!error) xfs_log_work_queue(mp); mp->m_super->s_flags &= ~MS_ACTIVE; + evict_inodes(mp->m_super); if (readonly) mp->m_flags |= XFS_MOUNT_RDONLY; diff --git a/include/linux/fs.h b/include/linux/fs.h index 6e1fd5d212487..00f46e0183706 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2831,6 +2831,7 @@ static inline void lockdep_annotate_inode_mutex_key(struct inode *inode) { }; #endif extern void unlock_new_inode(struct inode *); extern unsigned int get_next_ino(void); +extern void evict_inodes(struct super_block *sb); extern void __iget(struct inode * inode); extern void iget_failed(struct inode *); From f2e9ad212def50bcf4c098c6288779dd97fff0f0 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Fri, 25 Aug 2017 10:05:26 -0700 Subject: [PATCH 13/44] xfs: check for race with xfs_reclaim_inode() in xfs_ifree_cluster() After xfs_ifree_cluster() finds an inode in the radix tree and verifies that the inode number is what it expected, xfs_reclaim_inode() can swoop in and free it. xfs_ifree_cluster() will then happily continue working on the freed inode. Most importantly, it will mark the inode stale, which will probably be overwritten when the inode slab object is reallocated, but if it has already been reallocated then we can end up with an inode spuriously marked stale. In 8a17d7ddedb4 ("xfs: mark reclaimed inodes invalid earlier") we added a second check to xfs_iflush_cluster() to detect this race, but the similar RCU lookup in xfs_ifree_cluster() needs the same treatment. Signed-off-by: Omar Sandoval Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_icache.c | 10 +++++----- fs/xfs/xfs_inode.c | 23 ++++++++++++++++++----- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 0a9e6985a0d09..34227115a5d6e 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1124,11 +1124,11 @@ xfs_reclaim_inode( * Because we use RCU freeing we need to ensure the inode always appears * to be reclaimed with an invalid inode number when in the free state. * We do this as early as possible under the ILOCK so that - * xfs_iflush_cluster() can be guaranteed to detect races with us here. - * By doing this, we guarantee that once xfs_iflush_cluster has locked - * XFS_ILOCK that it will see either a valid, flushable inode that will - * serialise correctly, or it will see a clean (and invalid) inode that - * it can skip. + * xfs_iflush_cluster() and xfs_ifree_cluster() can be guaranteed to + * detect races with us here. By doing this, we guarantee that once + * xfs_iflush_cluster() or xfs_ifree_cluster() has locked XFS_ILOCK that + * it will see either a valid inode that will serialise correctly, or it + * will see an invalid inode that it can skip. */ spin_lock(&ip->i_flags_lock); ip->i_flags = XFS_IRECLAIM; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index ff48f00968100..97045e8dfed5f 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2359,11 +2359,24 @@ xfs_ifree_cluster( * already marked stale. If we can't lock it, back off * and retry. */ - if (ip != free_ip && - !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { - rcu_read_unlock(); - delay(1); - goto retry; + if (ip != free_ip) { + if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { + rcu_read_unlock(); + delay(1); + goto retry; + } + + /* + * Check the inode number again in case we're + * racing with freeing in xfs_reclaim_inode(). + * See the comments in that function for more + * information as to why the initial check is + * not sufficient. + */ + if (ip->i_ino != inum + i) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + continue; + } } rcu_read_unlock(); From 411350df14a3d6f1c769ea64a8b43a71f8d9760e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 28 Aug 2017 10:21:03 -0700 Subject: [PATCH 14/44] xfs: refactor xfs_trans_roll Split xfs_trans_roll into a low-level helper that just rolls the actual transaction and a new higher level xfs_trans_roll_inode that takes care of logging and rejoining the inode. This gets rid of the NULL inode case, and allows to simplify the special cases in the deferred operation code. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_attr.c | 16 ++++++++-------- fs/xfs/libxfs/xfs_attr_leaf.c | 6 +++--- fs/xfs/libxfs/xfs_attr_remote.c | 4 ++-- fs/xfs/libxfs/xfs_defer.c | 23 +++++++++-------------- fs/xfs/xfs_attr_inactive.c | 6 +++--- fs/xfs/xfs_inode.c | 4 ++-- fs/xfs/xfs_trans.c | 28 +++++----------------------- fs/xfs/xfs_trans.h | 3 ++- fs/xfs/xfs_trans_inode.c | 14 ++++++++++++++ 9 files changed, 48 insertions(+), 56 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index de7b9bd30becc..bafa0f6bfafac 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -341,7 +341,7 @@ xfs_attr_set( * transaction to add the new attribute to the leaf. */ - error = xfs_trans_roll(&args.trans, dp); + error = xfs_trans_roll_inode(&args.trans, dp); if (error) goto out; @@ -605,7 +605,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * Commit the current trans (including the inode) and start * a new one. */ - error = xfs_trans_roll(&args->trans, dp); + error = xfs_trans_roll_inode(&args->trans, dp); if (error) return error; @@ -620,7 +620,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * Commit the transaction that added the attr name so that * later routines can manage their own transactions. */ - error = xfs_trans_roll(&args->trans, dp); + error = xfs_trans_roll_inode(&args->trans, dp); if (error) return error; @@ -697,7 +697,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) /* * Commit the remove and start the next trans in series. */ - error = xfs_trans_roll(&args->trans, dp); + error = xfs_trans_roll_inode(&args->trans, dp); } else if (args->rmtblkno > 0) { /* @@ -885,7 +885,7 @@ xfs_attr_node_addname(xfs_da_args_t *args) * Commit the node conversion and start the next * trans in the chain. */ - error = xfs_trans_roll(&args->trans, dp); + error = xfs_trans_roll_inode(&args->trans, dp); if (error) goto out; @@ -925,7 +925,7 @@ xfs_attr_node_addname(xfs_da_args_t *args) * Commit the leaf addition or btree split and start the next * trans in the chain. */ - error = xfs_trans_roll(&args->trans, dp); + error = xfs_trans_roll_inode(&args->trans, dp); if (error) goto out; @@ -1012,7 +1012,7 @@ xfs_attr_node_addname(xfs_da_args_t *args) /* * Commit and start the next trans in the chain. */ - error = xfs_trans_roll(&args->trans, dp); + error = xfs_trans_roll_inode(&args->trans, dp); if (error) goto out; @@ -1132,7 +1132,7 @@ xfs_attr_node_removename(xfs_da_args_t *args) /* * Commit the Btree join operation and start a new trans. */ - error = xfs_trans_roll(&args->trans, dp); + error = xfs_trans_roll_inode(&args->trans, dp); if (error) goto out; } diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index c6c15e5717e42..5c16db86b38ff 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -2608,7 +2608,7 @@ xfs_attr3_leaf_clearflag( /* * Commit the flag value change and start the next trans in series. */ - return xfs_trans_roll(&args->trans, args->dp); + return xfs_trans_roll_inode(&args->trans, args->dp); } /* @@ -2659,7 +2659,7 @@ xfs_attr3_leaf_setflag( /* * Commit the flag value change and start the next trans in series. */ - return xfs_trans_roll(&args->trans, args->dp); + return xfs_trans_roll_inode(&args->trans, args->dp); } /* @@ -2777,7 +2777,7 @@ xfs_attr3_leaf_flipflags( /* * Commit the flag value change and start the next trans in series. */ - error = xfs_trans_roll(&args->trans, args->dp); + error = xfs_trans_roll_inode(&args->trans, args->dp); return error; } diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index 5236d8e451463..433c36714e408 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -484,7 +484,7 @@ xfs_attr_rmtval_set( /* * Start the next trans in the chain. */ - error = xfs_trans_roll(&args->trans, dp); + error = xfs_trans_roll_inode(&args->trans, dp); if (error) return error; } @@ -621,7 +621,7 @@ xfs_attr_rmtval_remove( /* * Close out trans and start the next one in the chain. */ - error = xfs_trans_roll(&args->trans, args->dp); + error = xfs_trans_roll_inode(&args->trans, args->dp); if (error) return error; } diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 5c2929f94bd3b..4ea2f068d95c2 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -240,23 +240,19 @@ xfs_defer_trans_abort( STATIC int xfs_defer_trans_roll( struct xfs_trans **tp, - struct xfs_defer_ops *dop, - struct xfs_inode *ip) + struct xfs_defer_ops *dop) { int i; int error; - /* Log all the joined inodes except the one we passed in. */ - for (i = 0; i < XFS_DEFER_OPS_NR_INODES && dop->dop_inodes[i]; i++) { - if (dop->dop_inodes[i] == ip) - continue; + /* Log all the joined inodes. */ + for (i = 0; i < XFS_DEFER_OPS_NR_INODES && dop->dop_inodes[i]; i++) xfs_trans_log_inode(*tp, dop->dop_inodes[i], XFS_ILOG_CORE); - } trace_xfs_defer_trans_roll((*tp)->t_mountp, dop); /* Roll the transaction. */ - error = xfs_trans_roll(tp, ip); + error = xfs_trans_roll(tp); if (error) { trace_xfs_defer_trans_roll_error((*tp)->t_mountp, dop, error); xfs_defer_trans_abort(*tp, dop, error); @@ -264,12 +260,9 @@ xfs_defer_trans_roll( } dop->dop_committed = true; - /* Rejoin the joined inodes except the one we passed in. */ - for (i = 0; i < XFS_DEFER_OPS_NR_INODES && dop->dop_inodes[i]; i++) { - if (dop->dop_inodes[i] == ip) - continue; + /* Rejoin the joined inodes. */ + for (i = 0; i < XFS_DEFER_OPS_NR_INODES && dop->dop_inodes[i]; i++) xfs_trans_ijoin(*tp, dop->dop_inodes[i], 0); - } return error; } @@ -331,13 +324,15 @@ xfs_defer_finish( trace_xfs_defer_finish((*tp)->t_mountp, dop); + xfs_defer_join(dop, ip); + /* Until we run out of pending work to finish... */ while (xfs_defer_has_unfinished_work(dop)) { /* Log intents for work items sitting in the intake. */ xfs_defer_intake_work(*tp, dop); /* Roll the transaction. */ - error = xfs_defer_trans_roll(tp, dop, ip); + error = xfs_defer_trans_roll(tp, dop); if (error) goto out; diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index be0b79d8900f0..ebd66b19fbfc3 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -97,7 +97,7 @@ xfs_attr3_leaf_freextent( /* * Roll to next transaction. */ - error = xfs_trans_roll(trans, dp); + error = xfs_trans_roll_inode(trans, dp); if (error) return error; } @@ -308,7 +308,7 @@ xfs_attr3_node_inactive( /* * Atomically commit the whole invalidate stuff. */ - error = xfs_trans_roll(trans, dp); + error = xfs_trans_roll_inode(trans, dp); if (error) return error; } @@ -375,7 +375,7 @@ xfs_attr3_root_inactive( /* * Commit the invalidate and start the next transaction. */ - error = xfs_trans_roll(trans, dp); + error = xfs_trans_roll_inode(trans, dp); return error; } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 97045e8dfed5f..f739a031986d2 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1055,7 +1055,7 @@ xfs_dir_ialloc( tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY); } - code = xfs_trans_roll(&tp, NULL); + code = xfs_trans_roll(&tp); if (committed != NULL) *committed = 1; @@ -1611,7 +1611,7 @@ xfs_itruncate_extents( if (error) goto out_bmap_cancel; - error = xfs_trans_roll(&tp, ip); + error = xfs_trans_roll_inode(&tp, ip); if (error) goto out; } diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 2011620008de8..a87f657f59c96 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1035,25 +1035,18 @@ xfs_trans_cancel( */ int xfs_trans_roll( - struct xfs_trans **tpp, - struct xfs_inode *dp) + struct xfs_trans **tpp) { - struct xfs_trans *trans; + struct xfs_trans *trans = *tpp; struct xfs_trans_res tres; int error; - /* - * Ensure that the inode is always logged. - */ - trans = *tpp; - if (dp) - xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE); - /* * Copy the critical parameters from one trans to the next. */ tres.tr_logres = trans->t_log_res; tres.tr_logcount = trans->t_log_count; + *tpp = xfs_trans_dup(trans); /* @@ -1067,10 +1060,8 @@ xfs_trans_roll( if (error) return error; - trans = *tpp; - /* - * Reserve space in the log for th next transaction. + * Reserve space in the log for the next transaction. * This also pushes items in the "AIL", the list of logged items, * out to disk if they are taking up space at the tail of the log * that we want to use. This requires that either nothing be locked @@ -1078,14 +1069,5 @@ xfs_trans_roll( * the prior and the next transactions. */ tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; - error = xfs_trans_reserve(trans, &tres, 0, 0); - /* - * Ensure that the inode is in the new transaction and locked. - */ - if (error) - return error; - - if (dp) - xfs_trans_ijoin(trans, dp, 0); - return 0; + return xfs_trans_reserve(*tpp, &tres, 0, 0); } diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 7d627721e4b31..b25d3d22e2895 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -228,7 +228,8 @@ int xfs_trans_free_extent(struct xfs_trans *, struct xfs_efd_log_item *, xfs_fsblock_t, xfs_extlen_t, struct xfs_owner_info *); int xfs_trans_commit(struct xfs_trans *); -int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *); +int xfs_trans_roll(struct xfs_trans **); +int xfs_trans_roll_inode(struct xfs_trans **, struct xfs_inode *); void xfs_trans_cancel(xfs_trans_t *); int xfs_trans_ail_init(struct xfs_mount *); void xfs_trans_ail_destroy(struct xfs_mount *); diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index dab8daa676f94..daa7615497f98 100644 --- a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c @@ -134,3 +134,17 @@ xfs_trans_log_inode( flags |= ip->i_itemp->ili_last_fields; ip->i_itemp->ili_fields |= flags; } + +int +xfs_trans_roll_inode( + struct xfs_trans **tpp, + struct xfs_inode *ip) +{ + int error; + + xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE); + error = xfs_trans_roll(tpp); + if (!error) + xfs_trans_ijoin(*tpp, ip, 0); + return error; +} From 882d8785fb87f691000a0b33c215364d74bd2ceb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 28 Aug 2017 10:21:03 -0700 Subject: [PATCH 15/44] xfs: rename xfs_defer_join to xfs_defer_ijoin Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_bmap.c | 2 +- fs/xfs/libxfs/xfs_defer.c | 4 ++-- fs/xfs/libxfs/xfs_defer.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index c09c16b1ad3b8..dcefadd4fc3a9 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -6452,7 +6452,7 @@ __xfs_bmap_add( bi->bi_whichfork = whichfork; bi->bi_bmap = *bmap; - error = xfs_defer_join(dfops, bi->bi_owner); + error = xfs_defer_ijoin(dfops, bi->bi_owner); if (error) { kmem_free(bi); return error; diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 4ea2f068d95c2..6c0da24c68c9b 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -281,7 +281,7 @@ xfs_defer_has_unfinished_work( * to xfs_defer_finish(). */ int -xfs_defer_join( +xfs_defer_ijoin( struct xfs_defer_ops *dop, struct xfs_inode *ip) { @@ -324,7 +324,7 @@ xfs_defer_finish( trace_xfs_defer_finish((*tp)->t_mountp, dop); - xfs_defer_join(dop, ip); + xfs_defer_ijoin(dop, ip); /* Until we run out of pending work to finish... */ while (xfs_defer_has_unfinished_work(dop)) { diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index f6e93ef0bffe8..70c944b21a2a3 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -77,7 +77,7 @@ int xfs_defer_finish(struct xfs_trans **tp, struct xfs_defer_ops *dop, void xfs_defer_cancel(struct xfs_defer_ops *dop); void xfs_defer_init(struct xfs_defer_ops *dop, xfs_fsblock_t *fbp); bool xfs_defer_has_unfinished_work(struct xfs_defer_ops *dop); -int xfs_defer_join(struct xfs_defer_ops *dop, struct xfs_inode *ip); +int xfs_defer_ijoin(struct xfs_defer_ops *dop, struct xfs_inode *ip); /* Description of a deferred type. */ struct xfs_defer_op_type { From 8ad7c629b18695ec1ee8654fb27599864049862b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 28 Aug 2017 10:21:04 -0700 Subject: [PATCH 16/44] xfs: remove the ip argument to xfs_defer_finish And instead require callers to explicitly join the inode using xfs_defer_ijoin. Also consolidate the defer error handling in a few places using a goto label. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_attr.c | 140 +++++++++++++++++--------------- fs/xfs/libxfs/xfs_attr_remote.c | 35 ++++---- fs/xfs/libxfs/xfs_bmap.c | 4 +- fs/xfs/libxfs/xfs_defer.c | 8 +- fs/xfs/libxfs/xfs_defer.h | 3 +- fs/xfs/libxfs/xfs_refcount.c | 2 +- fs/xfs/xfs_bmap_item.c | 2 +- fs/xfs/xfs_bmap_util.c | 10 ++- fs/xfs/xfs_dquot.c | 2 +- fs/xfs/xfs_inode.c | 13 +-- fs/xfs/xfs_iomap.c | 6 +- fs/xfs/xfs_refcount_item.c | 2 +- fs/xfs/xfs_reflink.c | 11 ++- fs/xfs/xfs_rtalloc.c | 2 +- fs/xfs/xfs_symlink.c | 5 +- 15 files changed, 129 insertions(+), 116 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index bafa0f6bfafac..6249c92671deb 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -328,13 +328,12 @@ xfs_attr_set( */ xfs_defer_init(args.dfops, args.firstblock); error = xfs_attr_shortform_to_leaf(&args); - if (!error) - error = xfs_defer_finish(&args.trans, args.dfops, dp); - if (error) { - args.trans = NULL; - xfs_defer_cancel(&dfops); - goto out; - } + if (error) + goto out_defer_cancel; + xfs_defer_ijoin(args.dfops, dp); + error = xfs_defer_finish(&args.trans, args.dfops); + if (error) + goto out_defer_cancel; /* * Commit the leaf transformation. We'll need another (linked) @@ -373,6 +372,9 @@ xfs_attr_set( return error; +out_defer_cancel: + xfs_defer_cancel(&dfops); + args.trans = NULL; out: if (args.trans) xfs_trans_cancel(args.trans); @@ -593,13 +595,12 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) */ xfs_defer_init(args->dfops, args->firstblock); error = xfs_attr3_leaf_to_node(args); - if (!error) - error = xfs_defer_finish(&args->trans, args->dfops, dp); - if (error) { - args->trans = NULL; - xfs_defer_cancel(args->dfops); - return error; - } + if (error) + goto out_defer_cancel; + xfs_defer_ijoin(args->dfops, dp); + error = xfs_defer_finish(&args->trans, args->dfops); + if (error) + goto out_defer_cancel; /* * Commit the current trans (including the inode) and start @@ -684,14 +685,12 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) xfs_defer_init(args->dfops, args->firstblock); error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ - if (!error) - error = xfs_defer_finish(&args->trans, - args->dfops, dp); - if (error) { - args->trans = NULL; - xfs_defer_cancel(args->dfops); - return error; - } + if (error) + goto out_defer_cancel; + xfs_defer_ijoin(args->dfops, dp); + error = xfs_defer_finish(&args->trans, args->dfops); + if (error) + goto out_defer_cancel; } /* @@ -706,6 +705,10 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) error = xfs_attr3_leaf_clearflag(args); } return error; +out_defer_cancel: + xfs_defer_cancel(args->dfops); + args->trans = NULL; + return error; } /* @@ -747,15 +750,18 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) xfs_defer_init(args->dfops, args->firstblock); error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ - if (!error) - error = xfs_defer_finish(&args->trans, args->dfops, dp); - if (error) { - args->trans = NULL; - xfs_defer_cancel(args->dfops); - return error; - } + if (error) + goto out_defer_cancel; + xfs_defer_ijoin(args->dfops, dp); + error = xfs_defer_finish(&args->trans, args->dfops); + if (error) + goto out_defer_cancel; } return 0; +out_defer_cancel: + xfs_defer_cancel(args->dfops); + args->trans = NULL; + return error; } /* @@ -872,14 +878,12 @@ xfs_attr_node_addname(xfs_da_args_t *args) state = NULL; xfs_defer_init(args->dfops, args->firstblock); error = xfs_attr3_leaf_to_node(args); - if (!error) - error = xfs_defer_finish(&args->trans, - args->dfops, dp); - if (error) { - args->trans = NULL; - xfs_defer_cancel(args->dfops); - goto out; - } + if (error) + goto out_defer_cancel; + xfs_defer_ijoin(args->dfops, dp); + error = xfs_defer_finish(&args->trans, args->dfops); + if (error) + goto out_defer_cancel; /* * Commit the node conversion and start the next @@ -900,13 +904,12 @@ xfs_attr_node_addname(xfs_da_args_t *args) */ xfs_defer_init(args->dfops, args->firstblock); error = xfs_da3_split(state); - if (!error) - error = xfs_defer_finish(&args->trans, args->dfops, dp); - if (error) { - args->trans = NULL; - xfs_defer_cancel(args->dfops); - goto out; - } + if (error) + goto out_defer_cancel; + xfs_defer_ijoin(args->dfops, dp); + error = xfs_defer_finish(&args->trans, args->dfops); + if (error) + goto out_defer_cancel; } else { /* * Addition succeeded, update Btree hashvals. @@ -999,14 +1002,12 @@ xfs_attr_node_addname(xfs_da_args_t *args) if (retval && (state->path.active > 1)) { xfs_defer_init(args->dfops, args->firstblock); error = xfs_da3_join(state); - if (!error) - error = xfs_defer_finish(&args->trans, - args->dfops, dp); - if (error) { - args->trans = NULL; - xfs_defer_cancel(args->dfops); - goto out; - } + if (error) + goto out_defer_cancel; + xfs_defer_ijoin(args->dfops, dp); + error = xfs_defer_finish(&args->trans, args->dfops); + if (error) + goto out_defer_cancel; } /* @@ -1032,6 +1033,10 @@ xfs_attr_node_addname(xfs_da_args_t *args) if (error) return error; return retval; +out_defer_cancel: + xfs_defer_cancel(args->dfops); + args->trans = NULL; + goto out; } /* @@ -1122,13 +1127,12 @@ xfs_attr_node_removename(xfs_da_args_t *args) if (retval && (state->path.active > 1)) { xfs_defer_init(args->dfops, args->firstblock); error = xfs_da3_join(state); - if (!error) - error = xfs_defer_finish(&args->trans, args->dfops, dp); - if (error) { - args->trans = NULL; - xfs_defer_cancel(args->dfops); - goto out; - } + if (error) + goto out_defer_cancel; + xfs_defer_ijoin(args->dfops, dp); + error = xfs_defer_finish(&args->trans, args->dfops); + if (error) + goto out_defer_cancel; /* * Commit the Btree join operation and start a new trans. */ @@ -1156,14 +1160,12 @@ xfs_attr_node_removename(xfs_da_args_t *args) xfs_defer_init(args->dfops, args->firstblock); error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ - if (!error) - error = xfs_defer_finish(&args->trans, - args->dfops, dp); - if (error) { - args->trans = NULL; - xfs_defer_cancel(args->dfops); - goto out; - } + if (error) + goto out_defer_cancel; + xfs_defer_ijoin(args->dfops, dp); + error = xfs_defer_finish(&args->trans, args->dfops); + if (error) + goto out_defer_cancel; } else xfs_trans_brelse(args->trans, bp); } @@ -1172,6 +1174,10 @@ xfs_attr_node_removename(xfs_da_args_t *args) out: xfs_da_state_free(state); return error; +out_defer_cancel: + xfs_defer_cancel(args->dfops); + args->trans = NULL; + goto out; } /* diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index 433c36714e408..d56caf037ca0e 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -467,13 +467,12 @@ xfs_attr_rmtval_set( error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno, blkcnt, XFS_BMAPI_ATTRFORK, args->firstblock, args->total, &map, &nmap, args->dfops); - if (!error) - error = xfs_defer_finish(&args->trans, args->dfops, dp); - if (error) { - args->trans = NULL; - xfs_defer_cancel(args->dfops); - return error; - } + if (error) + goto out_defer_cancel; + xfs_defer_ijoin(args->dfops, dp); + error = xfs_defer_finish(&args->trans, args->dfops); + if (error) + goto out_defer_cancel; ASSERT(nmap == 1); ASSERT((map.br_startblock != DELAYSTARTBLOCK) && @@ -539,6 +538,10 @@ xfs_attr_rmtval_set( } ASSERT(valuelen == 0); return 0; +out_defer_cancel: + xfs_defer_cancel(args->dfops); + args->trans = NULL; + return error; } /* @@ -609,14 +612,12 @@ xfs_attr_rmtval_remove( error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt, XFS_BMAPI_ATTRFORK, 1, args->firstblock, args->dfops, &done); - if (!error) - error = xfs_defer_finish(&args->trans, args->dfops, - args->dp); - if (error) { - args->trans = NULL; - xfs_defer_cancel(args->dfops); - return error; - } + if (error) + goto out_defer_cancel; + xfs_defer_ijoin(args->dfops, args->dp); + error = xfs_defer_finish(&args->trans, args->dfops); + if (error) + goto out_defer_cancel; /* * Close out trans and start the next one in the chain. @@ -626,4 +627,8 @@ xfs_attr_rmtval_remove( return error; } return 0; +out_defer_cancel: + xfs_defer_cancel(args->dfops); + args->trans = NULL; + return error; } diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index dcefadd4fc3a9..28ac796abe455 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -1196,7 +1196,7 @@ xfs_bmap_add_attrfork( xfs_log_sb(tp); } - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) goto bmap_cancel; error = xfs_trans_commit(tp); @@ -6402,7 +6402,7 @@ xfs_bmap_split_extent( if (error) goto out; - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) goto out; diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 6c0da24c68c9b..072ebfe1d6aeb 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -277,8 +277,7 @@ xfs_defer_has_unfinished_work( /* * Add this inode to the deferred op. Each joined inode is relogged - * each time we roll the transaction, in addition to any inode passed - * to xfs_defer_finish(). + * each time we roll the transaction. */ int xfs_defer_ijoin( @@ -310,8 +309,7 @@ xfs_defer_ijoin( int xfs_defer_finish( struct xfs_trans **tp, - struct xfs_defer_ops *dop, - struct xfs_inode *ip) + struct xfs_defer_ops *dop) { struct xfs_defer_pending *dfp; struct list_head *li; @@ -324,8 +322,6 @@ xfs_defer_finish( trace_xfs_defer_finish((*tp)->t_mountp, dop); - xfs_defer_ijoin(dop, ip); - /* Until we run out of pending work to finish... */ while (xfs_defer_has_unfinished_work(dop)) { /* Log intents for work items sitting in the intake. */ diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index 70c944b21a2a3..d4f046dd44bd4 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -72,8 +72,7 @@ struct xfs_defer_ops { void xfs_defer_add(struct xfs_defer_ops *dop, enum xfs_defer_ops_type type, struct list_head *h); -int xfs_defer_finish(struct xfs_trans **tp, struct xfs_defer_ops *dop, - struct xfs_inode *ip); +int xfs_defer_finish(struct xfs_trans **tp, struct xfs_defer_ops *dop); void xfs_defer_cancel(struct xfs_defer_ops *dop); void xfs_defer_init(struct xfs_defer_ops *dop, xfs_fsblock_t *fbp); bool xfs_defer_has_unfinished_work(struct xfs_defer_ops *dop); diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 45b1c3b4e047b..9d5406b4f6634 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -1679,7 +1679,7 @@ xfs_refcount_recover_cow_leftovers( xfs_bmap_add_free(mp, &dfops, fsb, rr->rr_rrec.rc_blockcount, NULL); - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) goto out_defer; diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 88073910fa5d4..dd136f7275e4a 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -502,7 +502,7 @@ xfs_bui_recover( } /* Finish transaction, free inodes. */ - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) goto err_dfops; diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 93e955262d07e..f7f04aebcadef 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1136,7 +1136,7 @@ xfs_alloc_file_space( /* * Complete the transaction */ - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) goto error0; @@ -1202,7 +1202,8 @@ xfs_unmap_extent( if (error) goto out_bmap_cancel; - error = xfs_defer_finish(&tp, &dfops, ip); + xfs_defer_ijoin(&dfops, ip); + error = xfs_defer_finish(&tp, &dfops); if (error) goto out_bmap_cancel; @@ -1496,7 +1497,7 @@ xfs_shift_file_space( if (error) goto out_bmap_cancel; - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) goto out_bmap_cancel; @@ -1777,7 +1778,8 @@ xfs_swap_extent_rmap( if (error) goto out_defer; - error = xfs_defer_finish(tpp, &dfops, ip); + xfs_defer_ijoin(&dfops, ip); + error = xfs_defer_finish(tpp, &dfops); if (error) goto out_defer; diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index fd2ef8c2c9a74..cd82429d8df76 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -383,7 +383,7 @@ xfs_qm_dqalloc( xfs_trans_bhold(tp, bp); - error = xfs_defer_finish(tpp, &dfops, NULL); + error = xfs_defer_finish(tpp, &dfops); if (error) goto error1; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index f739a031986d2..5599dda4727af 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1285,7 +1285,7 @@ xfs_create( */ xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp); - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) goto out_bmap_cancel; @@ -1513,7 +1513,7 @@ xfs_link( if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) xfs_trans_set_sync(tp); - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) { xfs_defer_cancel(&dfops); goto error_return; @@ -1607,7 +1607,8 @@ xfs_itruncate_extents( * Duplicate the transaction that has the permanent * reservation and commit the old transaction. */ - error = xfs_defer_finish(&tp, &dfops, ip); + xfs_defer_ijoin(&dfops, ip); + error = xfs_defer_finish(&tp, &dfops); if (error) goto out_bmap_cancel; @@ -1855,7 +1856,7 @@ xfs_inactive_ifree( * Just ignore errors at this point. There is nothing we can do except * to try to keep going. Make sure it's not a silent error. */ - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) { xfs_notice(mp, "%s: xfs_defer_finish returned error %d", __func__, error); @@ -2650,7 +2651,7 @@ xfs_remove( if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) xfs_trans_set_sync(tp); - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) goto out_bmap_cancel; @@ -2736,7 +2737,7 @@ xfs_finish_rename( if (tp->t_mountp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) xfs_trans_set_sync(tp); - error = xfs_defer_finish(&tp, dfops, NULL); + error = xfs_defer_finish(&tp, dfops); if (error) { xfs_defer_cancel(dfops); xfs_trans_cancel(tp); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 813394c628497..1b625d0504419 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -274,7 +274,7 @@ xfs_iomap_write_direct( /* * Complete the transaction */ - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) goto out_bmap_cancel; @@ -784,7 +784,7 @@ xfs_iomap_write_allocate( if (error) goto trans_cancel; - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) goto trans_cancel; @@ -906,7 +906,7 @@ xfs_iomap_write_unwritten( xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) goto error_on_bmapi_transaction; diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 96fe209b5eb61..8f2e2fac4255d 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -525,7 +525,7 @@ xfs_cui_recover( } xfs_refcount_finish_one_cleanup(tp, rcur, error); - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) goto abort_defer; set_bit(XFS_CUI_RECOVERED, &cuip->cui_flags); diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index f45fbf0db9bbe..3246815c24d65 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -464,7 +464,7 @@ xfs_reflink_allocate_cow( goto out_bmap_cancel; /* Finish up. */ - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) goto out_bmap_cancel; @@ -602,7 +602,8 @@ xfs_reflink_cancel_cow_blocks( -(long)del.br_blockcount); /* Roll the transaction */ - error = xfs_defer_finish(tpp, &dfops, ip); + xfs_defer_ijoin(&dfops, ip); + error = xfs_defer_finish(tpp, &dfops); if (error) { xfs_defer_cancel(&dfops); break; @@ -791,7 +792,8 @@ xfs_reflink_end_cow( /* Remove the mapping from the CoW fork. */ xfs_bmap_del_extent_cow(ip, &idx, &got, &del); - error = xfs_defer_finish(&tp, &dfops, ip); + xfs_defer_ijoin(&dfops, ip); + error = xfs_defer_finish(&tp, &dfops); if (error) goto out_defer; next_extent: @@ -1152,7 +1154,8 @@ xfs_reflink_remap_extent( next_extent: /* Process all the deferred stuff. */ - error = xfs_defer_finish(&tp, &dfops, ip); + xfs_defer_ijoin(&dfops, ip); + error = xfs_defer_finish(&tp, &dfops); if (error) goto out_defer; } diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 91472193643b1..488719d43ca82 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -810,7 +810,7 @@ xfs_growfs_rt_alloc( /* * Free any blocks freed up in the transaction, then commit. */ - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) goto out_bmap_cancel; error = xfs_trans_commit(tp); diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 23a50d7aa46a7..68d3ca2c49680 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -378,7 +378,7 @@ xfs_symlink( xfs_trans_set_sync(tp); } - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) goto out_bmap_cancel; @@ -497,7 +497,8 @@ xfs_inactive_symlink_rmt( /* * Commit the first transaction. This logs the EFI and the inode. */ - error = xfs_defer_finish(&tp, &dfops, ip); + xfs_defer_ijoin(&dfops, ip); + error = xfs_defer_finish(&tp, &dfops); if (error) goto error_bmap_cancel; /* From a4f6cf6b2b6b60ec2a05a33a32e65caa4149aa2b Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 29 Aug 2017 10:08:36 -0700 Subject: [PATCH 17/44] xfs: open-code xfs_buf_item_dirty() It checks a single flag and has one caller. It probably isn't worth its own function. Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_buf_item.c | 11 ----------- fs/xfs/xfs_buf_item.h | 1 - fs/xfs/xfs_trans_buf.c | 2 +- 3 files changed, 1 insertion(+), 13 deletions(-) diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 573fc72c3f23b..cdae0ad5e0a5c 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -945,17 +945,6 @@ xfs_buf_item_log( } -/* - * Return 1 if the buffer has been logged or ordered in a transaction (at any - * point, not just the current transaction) and 0 if not. - */ -uint -xfs_buf_item_dirty( - xfs_buf_log_item_t *bip) -{ - return (bip->bli_flags & XFS_BLI_DIRTY); -} - STATIC void xfs_buf_item_free( xfs_buf_log_item_t *bip) diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index 530686e1afb94..e0e744aefaa85 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -64,7 +64,6 @@ typedef struct xfs_buf_log_item { int xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); void xfs_buf_item_relse(struct xfs_buf *); void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint); -uint xfs_buf_item_dirty(xfs_buf_log_item_t *); void xfs_buf_attach_iodone(struct xfs_buf *, void(*)(struct xfs_buf *, xfs_log_item_t *), xfs_log_item_t *); diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 86987d823d764..cac8abbeca3f2 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -435,7 +435,7 @@ xfs_trans_brelse(xfs_trans_t *tp, if (XFS_FORCED_SHUTDOWN(tp->t_mountp) && freed) { xfs_trans_ail_remove(&bip->bli_item, SHUTDOWN_LOG_IO_ERROR); xfs_buf_item_relse(bp); - } else if (!xfs_buf_item_dirty(bip)) { + } else if (!(bip->bli_flags & XFS_BLI_DIRTY)) { /*** ASSERT(bp->b_pincount == 0); ***/ From 6453c65d3576bc3e602abb5add15f112755c08ca Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 29 Aug 2017 10:08:37 -0700 Subject: [PATCH 18/44] xfs: remove unnecessary dirty bli format check for ordered bufs xfs_buf_item_unlock() historically checked the dirty state of the buffer by manually checking the buffer log formats for dirty segments. The introduction of ordered buffers invalidated this check because ordered buffers have dirty bli's but no dirty (logged) segments. The check was updated to accommodate ordered buffers by looking at the bli state first and considering the blf only if the bli is clean. This logic is safe but unnecessary. There is no valid case where the bli is clean yet the blf has dirty segments. The bli is set dirty whenever the blf is logged (via xfs_trans_log_buf()) and the blf is cleared in the only place BLI_DIRTY is cleared (xfs_trans_binval()). Remove the conditional blf dirty checks and replace with an assert that should catch any discrepencies between bli and blf dirty states. Refactor the old blf dirty check into a helper function to be used by the assert. Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_buf_item.c | 62 ++++++++++++++++++++++--------------------- fs/xfs/xfs_buf_item.h | 1 + 2 files changed, 33 insertions(+), 30 deletions(-) diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index cdae0ad5e0a5c..ff076d11804a3 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -575,26 +575,18 @@ xfs_buf_item_unlock( { struct xfs_buf_log_item *bip = BUF_ITEM(lip); struct xfs_buf *bp = bip->bli_buf; - bool clean; - bool aborted; - int flags; + bool aborted = !!(lip->li_flags & XFS_LI_ABORTED); + bool hold = !!(bip->bli_flags & XFS_BLI_HOLD); + bool dirty = !!(bip->bli_flags & XFS_BLI_DIRTY); + bool ordered = !!(bip->bli_flags & XFS_BLI_ORDERED); /* Clear the buffer's association with this transaction. */ bp->b_transp = NULL; /* - * If this is a transaction abort, don't return early. Instead, allow - * the brelse to happen. Normally it would be done for stale - * (cancelled) buffers at unpin time, but we'll never go through the - * pin/unpin cycle if we abort inside commit. + * The per-transaction state has been copied above so clear it from the + * bli. */ - aborted = (lip->li_flags & XFS_LI_ABORTED) ? true : false; - /* - * Before possibly freeing the buf item, copy the per-transaction state - * so we can reference it safely later after clearing it from the - * buffer log item. - */ - flags = bip->bli_flags; bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED); /* @@ -602,7 +594,7 @@ xfs_buf_item_unlock( * unlock the buffer and free the buf item when the buffer is unpinned * for the last time. */ - if (flags & XFS_BLI_STALE) { + if (bip->bli_flags & XFS_BLI_STALE) { trace_xfs_buf_item_unlock_stale(bip); ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); if (!aborted) { @@ -620,20 +612,11 @@ xfs_buf_item_unlock( * regardless of whether it is dirty or not. A dirty abort implies a * shutdown, anyway. * - * Ordered buffers are dirty but may have no recorded changes, so ensure - * we only release clean items here. + * The bli dirty state should match whether the blf has logged segments + * except for ordered buffers, where only the bli should be dirty. */ - clean = (flags & XFS_BLI_DIRTY) ? false : true; - if (clean) { - int i; - for (i = 0; i < bip->bli_format_count; i++) { - if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map, - bip->bli_formats[i].blf_map_size)) { - clean = false; - break; - } - } - } + ASSERT((!ordered && dirty == xfs_buf_item_dirty_format(bip)) || + (ordered && dirty && !xfs_buf_item_dirty_format(bip))); /* * Clean buffers, by definition, cannot be in the AIL. However, aborted @@ -652,11 +635,11 @@ xfs_buf_item_unlock( ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp)); xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR); xfs_buf_item_relse(bp); - } else if (clean) + } else if (!dirty) xfs_buf_item_relse(bp); } - if (!(flags & XFS_BLI_HOLD)) + if (!hold) xfs_buf_relse(bp); } @@ -945,6 +928,25 @@ xfs_buf_item_log( } +/* + * Return true if the buffer has any ranges logged/dirtied by a transaction, + * false otherwise. + */ +bool +xfs_buf_item_dirty_format( + struct xfs_buf_log_item *bip) +{ + int i; + + for (i = 0; i < bip->bli_format_count; i++) { + if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map, + bip->bli_formats[i].blf_map_size)) + return true; + } + + return false; +} + STATIC void xfs_buf_item_free( xfs_buf_log_item_t *bip) diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index e0e744aefaa85..9690ce62c9a7f 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -64,6 +64,7 @@ typedef struct xfs_buf_log_item { int xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); void xfs_buf_item_relse(struct xfs_buf *); void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint); +bool xfs_buf_item_dirty_format(struct xfs_buf_log_item *); void xfs_buf_attach_iodone(struct xfs_buf *, void(*)(struct xfs_buf *, xfs_log_item_t *), xfs_log_item_t *); From e9385cc6fb7edf23702de33a2dc82965d92d9392 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 29 Aug 2017 10:08:37 -0700 Subject: [PATCH 19/44] xfs: ordered buffer log items are never formatted Ordered buffers pass through the logging infrastructure without ever being written to the log. The way this works is that the ordered buffer status is transferred to the log vector at commit time via the ->iop_size() callback. In xlog_cil_insert_format_items(), ordered log vectors bypass ->iop_format() processing altogether. Therefore it is unnecessary for xfs_buf_item_format() to handle ordered buffers. Remove the unnecessary logic and assert that an ordered buffer never reaches this point. Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_buf_item.c | 12 ++---------- fs/xfs/xfs_trace.h | 1 - 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index ff076d11804a3..ef2c1375f0920 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -323,6 +323,8 @@ xfs_buf_item_format( ASSERT((bip->bli_flags & XFS_BLI_STALE) || (xfs_blft_from_flags(&bip->__bli_format) > XFS_BLFT_UNKNOWN_BUF && xfs_blft_from_flags(&bip->__bli_format) < XFS_BLFT_MAX_BUF)); + ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED) || + (bip->bli_flags & XFS_BLI_STALE)); /* @@ -347,16 +349,6 @@ xfs_buf_item_format( bip->bli_flags &= ~XFS_BLI_INODE_BUF; } - if ((bip->bli_flags & (XFS_BLI_ORDERED|XFS_BLI_STALE)) == - XFS_BLI_ORDERED) { - /* - * The buffer has been logged just to order it. It is not being - * included in the transaction commit, so don't format it. - */ - trace_xfs_buf_item_format_ordered(bip); - return; - } - for (i = 0; i < bip->bli_format_count; i++) { xfs_buf_item_format_segment(bip, lv, &vecp, offset, &bip->bli_formats[i]); diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 68810477ef2c2..e839ab4db483b 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -517,7 +517,6 @@ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_ordered); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_stale); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format); -DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_ordered); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_stale); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_ordered); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin); From 9684010d38eccda733b61106765e9357cf436f65 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 29 Aug 2017 10:08:38 -0700 Subject: [PATCH 20/44] xfs: refactor buffer logging into buffer dirtying helper xfs_trans_log_buf() is responsible for logging the dirty segments of a buffer along with setting all of the necessary state on the transaction, buffer, bli, etc., to ensure that the associated items are marked as dirty and prepared for I/O. We have a couple use cases that need to to dirty a buffer in a transaction without actually logging dirty ranges of the buffer. One existing use case is ordered buffers, which are currently logged with arbitrary ranges to accomplish this even though the content of ordered buffers is never written to the log. Another pending use case is to relog an already dirty buffer across rolled transactions within the deferred operations infrastructure. This is required to prevent a held (XFS_BLI_HOLD) buffer from pinning the tail of the log. Refactor xfs_trans_log_buf() into a new function that contains all of the logic responsible to dirty the transaction, lidp, buffer and bli. This new function can be used in the future for the use cases outlined above. This patch does not introduce functional changes. Signed-off-by: Brian Foster Reviewed-by: Allison Henderson Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_trans.h | 4 +++- fs/xfs/xfs_trans_buf.c | 46 +++++++++++++++++++++++++++--------------- 2 files changed, 33 insertions(+), 17 deletions(-) diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index b25d3d22e2895..afe0af1778c75 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -217,7 +217,9 @@ void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint); void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *); void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int); void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *, uint); -void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint); +void xfs_trans_log_buf(struct xfs_trans *, struct xfs_buf *, uint, + uint); +void xfs_trans_dirty_buf(struct xfs_trans *, struct xfs_buf *); void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint); void xfs_extent_free_init_defer_op(void); diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index cac8abbeca3f2..8c99813e5377d 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -493,25 +493,17 @@ xfs_trans_bhold_release(xfs_trans_t *tp, } /* - * This is called to mark bytes first through last inclusive of the given - * buffer as needing to be logged when the transaction is committed. - * The buffer must already be associated with the given transaction. - * - * First and last are numbers relative to the beginning of this buffer, - * so the first byte in the buffer is numbered 0 regardless of the - * value of b_blkno. + * Mark a buffer dirty in the transaction. */ void -xfs_trans_log_buf(xfs_trans_t *tp, - xfs_buf_t *bp, - uint first, - uint last) +xfs_trans_dirty_buf( + struct xfs_trans *tp, + struct xfs_buf *bp) { - xfs_buf_log_item_t *bip = bp->b_fspriv; + struct xfs_buf_log_item *bip = bp->b_fspriv; ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); - ASSERT(first <= last && last < BBTOB(bp->b_length)); ASSERT(bp->b_iodone == NULL || bp->b_iodone == xfs_buf_iodone_callbacks); @@ -531,8 +523,6 @@ xfs_trans_log_buf(xfs_trans_t *tp, bp->b_iodone = xfs_buf_iodone_callbacks; bip->bli_item.li_cb = xfs_buf_iodone; - trace_xfs_trans_log_buf(bip); - /* * If we invalidated the buffer within this transaction, then * cancel the invalidation now that we're dirtying the buffer @@ -545,15 +535,39 @@ xfs_trans_log_buf(xfs_trans_t *tp, bp->b_flags &= ~XBF_STALE; bip->__bli_format.blf_flags &= ~XFS_BLF_CANCEL; } + bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED; tp->t_flags |= XFS_TRANS_DIRTY; bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY; +} + +/* + * This is called to mark bytes first through last inclusive of the given + * buffer as needing to be logged when the transaction is committed. + * The buffer must already be associated with the given transaction. + * + * First and last are numbers relative to the beginning of this buffer, + * so the first byte in the buffer is numbered 0 regardless of the + * value of b_blkno. + */ +void +xfs_trans_log_buf( + struct xfs_trans *tp, + struct xfs_buf *bp, + uint first, + uint last) +{ + struct xfs_buf_log_item *bip = bp->b_fspriv; + + ASSERT(first <= last && last < BBTOB(bp->b_length)); + + xfs_trans_dirty_buf(tp, bp); /* * If we have an ordered buffer we are not logging any dirty range but * it still needs to be marked dirty and that it has been logged. */ - bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED; + trace_xfs_trans_log_buf(bip); if (!(bip->bli_flags & XFS_BLI_ORDERED)) xfs_buf_item_log(bip, first, last); } From 8dc518dfa7dbd079581269e51074b3c55a65a880 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 29 Aug 2017 10:08:38 -0700 Subject: [PATCH 21/44] xfs: don't log dirty ranges for ordered buffers Ordered buffers are attached to transactions and pushed through the logging infrastructure just like normal buffers with the exception that they are not actually written to the log. Therefore, we don't need to log dirty ranges of ordered buffers. xfs_trans_log_buf() is called on ordered buffers to set up all of the dirty state on the transaction, buffer and log item and prepare the buffer for I/O. Now that xfs_trans_dirty_buf() is available, call it from xfs_trans_ordered_buf() so the latter is now mutually exclusive with xfs_trans_log_buf(). This reflects the implementation of ordered buffers and helps eliminate confusion over the need to log ranges of ordered buffers just to set up internal log state. Signed-off-by: Brian Foster Reviewed-by: Allison Henderson Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_btree.c | 6 ++---- fs/xfs/libxfs/xfs_ialloc.c | 2 -- fs/xfs/xfs_trans_buf.c | 26 ++++++++++++++------------ 3 files changed, 16 insertions(+), 18 deletions(-) diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index e0bcc4a59efda..0b7905a4e27a8 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -4464,12 +4464,10 @@ xfs_btree_block_change_owner( * though, so everything is consistent in memory. */ if (bp) { - if (cur->bc_tp) { + if (cur->bc_tp) xfs_trans_ordered_buf(cur->bc_tp, bp); - xfs_btree_log_block(cur, bp, XFS_BB_OWNER); - } else { + else xfs_buf_delwri_queue(bp, bbcoi->buffer_list); - } } else { ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); ASSERT(level == cur->bc_nlevels - 1); diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 1e0658a3f155a..988bb3f314466 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -378,8 +378,6 @@ xfs_ialloc_inode_init( * transaction and pin the log appropriately. */ xfs_trans_ordered_buf(tp, fbuf); - xfs_trans_log_buf(tp, fbuf, 0, - BBTOB(fbuf->b_length) - 1); } } else { fbuf->b_flags |= XBF_DONE; diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 8c99813e5377d..3089e80153693 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -560,16 +560,12 @@ xfs_trans_log_buf( struct xfs_buf_log_item *bip = bp->b_fspriv; ASSERT(first <= last && last < BBTOB(bp->b_length)); + ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED)); xfs_trans_dirty_buf(tp, bp); - /* - * If we have an ordered buffer we are not logging any dirty range but - * it still needs to be marked dirty and that it has been logged. - */ trace_xfs_trans_log_buf(bip); - if (!(bip->bli_flags & XFS_BLI_ORDERED)) - xfs_buf_item_log(bip, first, last); + xfs_buf_item_log(bip, first, last); } @@ -722,12 +718,11 @@ xfs_trans_inode_alloc_buf( } /* - * Mark the buffer as ordered for this transaction. This means - * that the contents of the buffer are not recorded in the transaction - * but it is tracked in the AIL as though it was. This allows us - * to record logical changes in transactions rather than the physical - * changes we make to the buffer without changing writeback ordering - * constraints of metadata buffers. + * Mark the buffer as ordered for this transaction. This means that the contents + * of the buffer are not recorded in the transaction but it is tracked in the + * AIL as though it was. This allows us to record logical changes in + * transactions rather than the physical changes we make to the buffer without + * changing writeback ordering constraints of metadata buffers. */ void xfs_trans_ordered_buf( @@ -739,9 +734,16 @@ xfs_trans_ordered_buf( ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); + ASSERT(!xfs_buf_item_dirty_format(bip)); bip->bli_flags |= XFS_BLI_ORDERED; trace_xfs_buf_item_ordered(bip); + + /* + * We don't log a dirty range of an ordered buffer but it still needs + * to be marked dirty and that it has been logged. + */ + xfs_trans_dirty_buf(tp, bp); } /* From 99c794c639a65cc7b74f30a674048fd100fe9ac8 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 29 Aug 2017 10:08:39 -0700 Subject: [PATCH 22/44] xfs: skip bmbt block ino validation during owner change Extent swap uses xfs_btree_visit_blocks() to fix up bmbt block owners on v5 (!rmapbt) filesystems. The bmbt scan uses xfs_btree_lookup_get_block() to read bmbt blocks which verifies the current owner of the block against the parent inode of the bmbt. This works during extent swap because the bmbt owners are updated to the opposite inode number before the inode extent forks are swapped. The modified bmbt blocks are marked as ordered buffers which allows everything to commit in a single transaction. If the transaction commits to the log and the system crashes such that recovery of the extent swap is required, log recovery restarts the bmbt scan to fix up any bmbt blocks that may have not been written back before the crash. The log recovery bmbt scan occurs after the inode forks have been swapped, however. This causes the bmbt block owner verification to fail, leads to log recovery failure and requires xfs_repair to zap the log to recover. Define a new invalid inode owner flag to inform the btree block lookup mechanism that the current inode may be invalid with respect to the current owner of the bmbt block. Set this flag on the cursor used for change owner scans to allow this operation to work at runtime and during log recovery. Signed-off-by: Brian Foster Fixes: bb3be7e7c ("xfs: check for bogus values in btree block headers") Cc: stable@vger.kernel.org Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_bmap_btree.c | 1 + fs/xfs/libxfs/xfs_btree.c | 1 + fs/xfs/libxfs/xfs_btree.h | 3 ++- 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 85de225130145..a6331ffa51e3e 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -858,6 +858,7 @@ xfs_bmbt_change_owner( cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork); if (!cur) return -ENOMEM; + cur->bc_private.b.flags |= XFS_BTCUR_BPRV_INVALID_OWNER; error = xfs_btree_change_owner(cur, new_owner, buffer_list); xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 0b7905a4e27a8..1d15d04e29a55 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -1791,6 +1791,7 @@ xfs_btree_lookup_get_block( /* Check the inode owner since the verifiers don't. */ if (xfs_sb_version_hascrc(&cur->bc_mp->m_sb) && + !(cur->bc_private.b.flags & XFS_BTCUR_BPRV_INVALID_OWNER) && (cur->bc_flags & XFS_BTREE_LONG_PTRS) && be64_to_cpu((*blkp)->bb_u.l.bb_owner) != cur->bc_private.b.ip->i_ino) diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 9c95e965cfe53..f2a88c3b11595 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -233,7 +233,8 @@ typedef struct xfs_btree_cur short forksize; /* fork's inode space */ char whichfork; /* data or attr fork */ char flags; /* flags */ -#define XFS_BTCUR_BPRV_WASDEL 1 /* was delayed */ +#define XFS_BTCUR_BPRV_WASDEL (1<<0) /* was delayed */ +#define XFS_BTCUR_BPRV_INVALID_OWNER (1<<1) /* for ext swap */ } b; } bc_private; /* per-btree type data */ } xfs_btree_cur_t; From 6fb10d6d22094bc4062f92b9ccbcee2f54033d04 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 29 Aug 2017 10:08:39 -0700 Subject: [PATCH 23/44] xfs: move bmbt owner change to last step of extent swap The extent swap operation currently resets bmbt block owners before the inode forks are swapped. The bmbt buffers are marked as ordered so they do not have to be physically logged in the transaction. This use of ordered buffers is not safe as bmbt buffers may have been previously physically logged. The bmbt owner change algorithm needs to be updated to physically log buffers that are already dirty when/if they are encountered. This means that an extent swap will eventually require multiple rolling transactions to handle large btrees. In addition, all inode related changes must be logged before the bmbt owner change scan begins and can roll the transaction for the first time to preserve fs consistency via log recovery. In preparation for such fixes to the bmbt owner change algorithm, refactor the bmbt scan out of the extent fork swap code to the last operation before the transaction is committed. Update xfs_swap_extent_forks() to only set the inode log flags when an owner change scan is necessary. Update xfs_swap_extents() to trigger the owner change based on the inode log flags. Note that since the owner change now occurs after the extent fork swap, the inode btrees must be fixed up with the inode number of the current inode (similar to log recovery). Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_bmap_util.c | 44 +++++++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index f7f04aebcadef..1d4fa18d4303b 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1842,29 +1842,18 @@ xfs_swap_extent_forks( } /* - * Before we've swapped the forks, lets set the owners of the forks - * appropriately. We have to do this as we are demand paging the btree - * buffers, and so the validation done on read will expect the owner - * field to be correctly set. Once we change the owners, we can swap the - * inode forks. + * Btree format (v3) inodes have the inode number stamped in the bmbt + * block headers. We can't start changing the bmbt blocks until the + * inode owner change is logged so recovery does the right thing in the + * event of a crash. Set the owner change log flags now and leave the + * bmbt scan as the last step. */ if (ip->i_d.di_version == 3 && - ip->i_d.di_format == XFS_DINODE_FMT_BTREE) { + ip->i_d.di_format == XFS_DINODE_FMT_BTREE) (*target_log_flags) |= XFS_ILOG_DOWNER; - error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, - tip->i_ino, NULL); - if (error) - return error; - } - if (tip->i_d.di_version == 3 && - tip->i_d.di_format == XFS_DINODE_FMT_BTREE) { + tip->i_d.di_format == XFS_DINODE_FMT_BTREE) (*src_log_flags) |= XFS_ILOG_DOWNER; - error = xfs_bmbt_change_owner(tp, tip, XFS_DATA_FORK, - ip->i_ino, NULL); - if (error) - return error; - } /* * Swap the data forks of the inodes @@ -2093,6 +2082,25 @@ xfs_swap_extents( xfs_trans_log_inode(tp, ip, src_log_flags); xfs_trans_log_inode(tp, tip, target_log_flags); + /* + * The extent forks have been swapped, but crc=1,rmapbt=0 filesystems + * have inode number owner values in the bmbt blocks that still refer to + * the old inode. Scan each bmbt to fix up the owner values with the + * inode number of the current inode. + */ + if (src_log_flags & XFS_ILOG_DOWNER) { + error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, + ip->i_ino, NULL); + if (error) + goto out_trans_cancel; + } + if (target_log_flags & XFS_ILOG_DOWNER) { + error = xfs_bmbt_change_owner(tp, tip, XFS_DATA_FORK, + tip->i_ino, NULL); + if (error) + goto out_trans_cancel; + } + /* * If this is a synchronous mount, make sure that the * transaction goes to disk before returning to the user. From a5814bceea48ee1c57c4db2bd54b0c0246daf54a Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 29 Aug 2017 10:08:40 -0700 Subject: [PATCH 24/44] xfs: disallow marking previously dirty buffers as ordered Ordered buffers are used in situations where the buffer is not physically logged but must pass through the transaction/logging pipeline for a particular transaction. As a result, ordered buffers are not unpinned and written back until the transaction commits to the log. Ordered buffers have a strict requirement that the target buffer must not be currently dirty and resident in the log pipeline at the time it is marked ordered. If a dirty+ordered buffer is committed, the buffer is reinserted to the AIL but not physically relogged at the LSN of the associated checkpoint. The buffer log item is assigned the LSN of the latest checkpoint and the AIL effectively releases the previously logged buffer content from the active log before the buffer has been written back. If the tail pushes forward and a filesystem crash occurs while in this state, an inconsistent filesystem could result. It is currently the caller responsibility to ensure an ordered buffer is not already dirty from a previous modification. This is unclear and error prone when not used in situations where it is guaranteed a buffer has not been previously modified (such as new metadata allocations). To facilitate general purpose use of ordered buffers, update xfs_trans_ordered_buf() to conditionally order the buffer based on state of the log item and return the status of the result. If the bli is dirty, do not order the buffer and return false. The caller must either physically log the buffer (having acquired the appropriate log reservation) or push it from the AIL to clean it before it can be marked ordered in the current transaction. Note that ordered buffers are currently only used in two situations: 1.) inode chunk allocation where previously logged buffers are not possible and 2.) extent swap which will be updated to handle ordered buffer failures in a separate patch. Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_trans.h | 2 +- fs/xfs/xfs_trans_buf.c | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index afe0af1778c75..815b53d20e262 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -212,7 +212,7 @@ void xfs_trans_bhold_release(xfs_trans_t *, struct xfs_buf *); void xfs_trans_binval(xfs_trans_t *, struct xfs_buf *); void xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *); void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *); -void xfs_trans_ordered_buf(xfs_trans_t *, struct xfs_buf *); +bool xfs_trans_ordered_buf(xfs_trans_t *, struct xfs_buf *); void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint); void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *); void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int); diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 3089e80153693..3ba7a96a8abdb 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -724,7 +724,7 @@ xfs_trans_inode_alloc_buf( * transactions rather than the physical changes we make to the buffer without * changing writeback ordering constraints of metadata buffers. */ -void +bool xfs_trans_ordered_buf( struct xfs_trans *tp, struct xfs_buf *bp) @@ -734,7 +734,9 @@ xfs_trans_ordered_buf( ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); - ASSERT(!xfs_buf_item_dirty_format(bip)); + + if (xfs_buf_item_dirty_format(bip)) + return false; bip->bli_flags |= XFS_BLI_ORDERED; trace_xfs_buf_item_ordered(bip); @@ -744,6 +746,7 @@ xfs_trans_ordered_buf( * to be marked dirty and that it has been logged. */ xfs_trans_dirty_buf(tp, bp); + return true; } /* From 2dd3d709fc4338681a3aa61658122fa8faa5a437 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 29 Aug 2017 10:08:40 -0700 Subject: [PATCH 25/44] xfs: relog dirty buffers during swapext bmbt owner change The owner change bmbt scan that occurs during extent swap operations does not handle ordered buffer failures. Buffers that cannot be marked ordered must be physically logged so previously dirty ranges of the buffer can be relogged in the transaction. Since the bmbt scan may need to process and potentially log a large number of blocks, we can't expect to complete this operation in a single transaction. Update extent swap to use a permanent transaction with enough log reservation to physically log a buffer. Update the bmbt scan to physically log any buffers that cannot be ordered and to terminate the scan with -EAGAIN. On -EAGAIN, the caller rolls the transaction and restarts the scan. Finally, update the bmbt scan helper function to skip bmbt blocks that already match the expected owner so they are not reprocessed after scan restarts. Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig [darrick: fix the xfs_trans_roll call] Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_btree.c | 26 ++++++++++++------ fs/xfs/xfs_bmap_util.c | 57 ++++++++++++++++++++++++++++++++------- 2 files changed, 65 insertions(+), 18 deletions(-) diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 1d15d04e29a55..5bfb88261c7e9 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -4452,10 +4452,15 @@ xfs_btree_block_change_owner( /* modify the owner */ block = xfs_btree_get_block(cur, level, &bp); - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + if (block->bb_u.l.bb_owner == cpu_to_be64(bbcoi->new_owner)) + return 0; block->bb_u.l.bb_owner = cpu_to_be64(bbcoi->new_owner); - else + } else { + if (block->bb_u.s.bb_owner == cpu_to_be32(bbcoi->new_owner)) + return 0; block->bb_u.s.bb_owner = cpu_to_be32(bbcoi->new_owner); + } /* * If the block is a root block hosted in an inode, we might not have a @@ -4464,14 +4469,19 @@ xfs_btree_block_change_owner( * block is formatted into the on-disk inode fork. We still change it, * though, so everything is consistent in memory. */ - if (bp) { - if (cur->bc_tp) - xfs_trans_ordered_buf(cur->bc_tp, bp); - else - xfs_buf_delwri_queue(bp, bbcoi->buffer_list); - } else { + if (!bp) { ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); ASSERT(level == cur->bc_nlevels - 1); + return 0; + } + + if (cur->bc_tp) { + if (!xfs_trans_ordered_buf(cur->bc_tp, bp)) { + xfs_btree_log_block(cur, bp, XFS_BB_OWNER); + return -EAGAIN; + } + } else { + xfs_buf_delwri_queue(bp, bbcoi->buffer_list); } return 0; diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 1d4fa18d4303b..8661be0aacaae 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1931,6 +1931,48 @@ xfs_swap_extent_forks( return 0; } +/* + * Fix up the owners of the bmbt blocks to refer to the current inode. The + * change owner scan attempts to order all modified buffers in the current + * transaction. In the event of ordered buffer failure, the offending buffer is + * physically logged as a fallback and the scan returns -EAGAIN. We must roll + * the transaction in this case to replenish the fallback log reservation and + * restart the scan. This process repeats until the scan completes. + */ +static int +xfs_swap_change_owner( + struct xfs_trans **tpp, + struct xfs_inode *ip, + struct xfs_inode *tmpip) +{ + int error; + struct xfs_trans *tp = *tpp; + + do { + error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, ip->i_ino, + NULL); + /* success or fatal error */ + if (error != -EAGAIN) + break; + + error = xfs_trans_roll(tpp); + if (error) + break; + tp = *tpp; + + /* + * Redirty both inodes so they can relog and keep the log tail + * moving forward. + */ + xfs_trans_ijoin(tp, ip, 0); + xfs_trans_ijoin(tp, tmpip, 0); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + xfs_trans_log_inode(tp, tmpip, XFS_ILOG_CORE); + } while (true); + + return error; +} + int xfs_swap_extents( struct xfs_inode *ip, /* target inode */ @@ -1945,7 +1987,7 @@ xfs_swap_extents( int lock_flags; struct xfs_ifork *cowfp; uint64_t f; - int resblks; + int resblks = 0; /* * Lock the inodes against other IO, page faults and truncate to @@ -1993,11 +2035,8 @@ xfs_swap_extents( XFS_SWAP_RMAP_SPACE_RES(mp, XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK), XFS_DATA_FORK); - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, - 0, 0, &tp); - } else - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, - 0, 0, &tp); + } + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); if (error) goto out_unlock; @@ -2089,14 +2128,12 @@ xfs_swap_extents( * inode number of the current inode. */ if (src_log_flags & XFS_ILOG_DOWNER) { - error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, - ip->i_ino, NULL); + error = xfs_swap_change_owner(&tp, ip, tip); if (error) goto out_trans_cancel; } if (target_log_flags & XFS_ILOG_DOWNER) { - error = xfs_bmbt_change_owner(tp, tip, XFS_DATA_FORK, - tip->i_ino, NULL); + error = xfs_swap_change_owner(&tp, tip, ip); if (error) goto out_trans_cancel; } From e7647fb49167809502724eb3c402cea77716fc67 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 29 Aug 2017 10:08:41 -0700 Subject: [PATCH 26/44] iomap: return VM_FAULT_* codes from iomap_page_mkwrite All callers will need the VM_FAULT_* flags, so convert in the helper. Signed-off-by: Christoph Hellwig Reviewed-by: Ross Zwisler Reviewed-by: Jan Kara Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/iomap.c | 4 ++-- fs/xfs/xfs_file.c | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/iomap.c b/fs/iomap.c index 59cc98ad7577b..8554a8d752812 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -477,10 +477,10 @@ int iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) set_page_dirty(page); wait_for_stable_page(page); - return 0; + return VM_FAULT_LOCKED; out_unlock: unlock_page(page); - return ret; + return block_page_mkwrite_return(ret); } EXPORT_SYMBOL_GPL(iomap_page_mkwrite); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index c4893e226fd8e..8b0181c6d2a6e 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1035,7 +1035,6 @@ xfs_filemap_page_mkwrite( ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops); } else { ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops); - ret = block_page_mkwrite_return(ret); } xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); From d522d569d6adf72ceda90153a086e089e6c2fbc6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 29 Aug 2017 10:08:41 -0700 Subject: [PATCH 27/44] xfs: consolidate the various page fault handlers Add a new __xfs_filemap_fault helper that implements all four page fault callouts, and make these methods themselves small stubs that set the correct write_fault flag, and exit early for the non-DAX case for the hugepage related ones. Also remove the extra size checking in the pfn_fault path, which is now handled in the core DAX code. Life would be so much simpler if we only had one method for all this. Signed-off-by: Christoph Hellwig Reviewed-by: Ross Zwisler Reviewed-by: Jan Kara Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_file.c | 96 ++++++++++++++++------------------------------ fs/xfs/xfs_trace.h | 29 ++++++++++++-- 2 files changed, 60 insertions(+), 65 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 8b0181c6d2a6e..0debbc7e3f039 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1011,95 +1011,67 @@ xfs_file_llseek( * page_lock (MM) * i_lock (XFS - extent map serialisation) */ - -/* - * mmap()d file has taken write protection fault and is being made writable. We - * can set the page state up correctly for a writable page, which means we can - * do correct delalloc accounting (ENOSPC checking!) and unwritten extent - * mapping. - */ -STATIC int -xfs_filemap_page_mkwrite( - struct vm_fault *vmf) +static int +__xfs_filemap_fault( + struct vm_fault *vmf, + enum page_entry_size pe_size, + bool write_fault) { struct inode *inode = file_inode(vmf->vma->vm_file); + struct xfs_inode *ip = XFS_I(inode); int ret; - trace_xfs_filemap_page_mkwrite(XFS_I(inode)); + trace_xfs_filemap_fault(ip, pe_size, write_fault); - sb_start_pagefault(inode->i_sb); - file_update_time(vmf->vma->vm_file); - xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); + if (write_fault) { + sb_start_pagefault(inode->i_sb); + file_update_time(vmf->vma->vm_file); + } + xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (IS_DAX(inode)) { - ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops); + ret = dax_iomap_fault(vmf, pe_size, &xfs_iomap_ops); } else { - ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops); + if (write_fault) + ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops); + else + ret = filemap_fault(vmf); } - xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - sb_end_pagefault(inode->i_sb); + if (write_fault) + sb_end_pagefault(inode->i_sb); return ret; } -STATIC int +static int xfs_filemap_fault( struct vm_fault *vmf) { - struct inode *inode = file_inode(vmf->vma->vm_file); - int ret; - - trace_xfs_filemap_fault(XFS_I(inode)); - /* DAX can shortcut the normal fault path on write faults! */ - if ((vmf->flags & FAULT_FLAG_WRITE) && IS_DAX(inode)) - return xfs_filemap_page_mkwrite(vmf); - - xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - if (IS_DAX(inode)) - ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops); - else - ret = filemap_fault(vmf); - xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - - return ret; + return __xfs_filemap_fault(vmf, PE_SIZE_PTE, + IS_DAX(file_inode(vmf->vma->vm_file)) && + (vmf->flags & FAULT_FLAG_WRITE)); } -/* - * Similar to xfs_filemap_fault(), the DAX fault path can call into here on - * both read and write faults. Hence we need to handle both cases. There is no - * ->huge_mkwrite callout for huge pages, so we have a single function here to - * handle both cases here. @flags carries the information on the type of fault - * occuring. - */ -STATIC int +static int xfs_filemap_huge_fault( struct vm_fault *vmf, enum page_entry_size pe_size) { - struct inode *inode = file_inode(vmf->vma->vm_file); - struct xfs_inode *ip = XFS_I(inode); - int ret; - - if (!IS_DAX(inode)) + if (!IS_DAX(file_inode(vmf->vma->vm_file))) return VM_FAULT_FALLBACK; - trace_xfs_filemap_huge_fault(ip); - - if (vmf->flags & FAULT_FLAG_WRITE) { - sb_start_pagefault(inode->i_sb); - file_update_time(vmf->vma->vm_file); - } - - xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - ret = dax_iomap_fault(vmf, pe_size, &xfs_iomap_ops); - xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - - if (vmf->flags & FAULT_FLAG_WRITE) - sb_end_pagefault(inode->i_sb); + /* DAX can shortcut the normal fault path on write faults! */ + return __xfs_filemap_fault(vmf, pe_size, + (vmf->flags & FAULT_FLAG_WRITE)); +} - return ret; +static int +xfs_filemap_page_mkwrite( + struct vm_fault *vmf) +{ + return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true); } /* diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index e839ab4db483b..bb5514688d470 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -688,11 +688,34 @@ DEFINE_INODE_EVENT(xfs_inode_set_cowblocks_tag); DEFINE_INODE_EVENT(xfs_inode_clear_cowblocks_tag); DEFINE_INODE_EVENT(xfs_inode_free_cowblocks_invalid); -DEFINE_INODE_EVENT(xfs_filemap_fault); -DEFINE_INODE_EVENT(xfs_filemap_huge_fault); -DEFINE_INODE_EVENT(xfs_filemap_page_mkwrite); DEFINE_INODE_EVENT(xfs_filemap_pfn_mkwrite); +TRACE_EVENT(xfs_filemap_fault, + TP_PROTO(struct xfs_inode *ip, enum page_entry_size pe_size, + bool write_fault), + TP_ARGS(ip, pe_size, write_fault), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(enum page_entry_size, pe_size) + __field(bool, write_fault) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->pe_size = pe_size; + __entry->write_fault = write_fault; + ), + TP_printk("dev %d:%d ino 0x%llx %s write_fault %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_symbolic(__entry->pe_size, + { PE_SIZE_PTE, "PTE" }, + { PE_SIZE_PMD, "PMD" }, + { PE_SIZE_PUD, "PUD" }), + __entry->write_fault) +) + DECLARE_EVENT_CLASS(xfs_iref_class, TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), TP_ARGS(ip, caller_ip), From 67e4e69cb2a7afbffdefd1a0a23a94d1d706c38f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 29 Aug 2017 15:44:11 -0700 Subject: [PATCH 28/44] xfs: add a xfs_iext_update_extent helper This helper is used to update an extent record based on the extent index, and can be used to provide a level of abstractions between callers that want to modify in-core extent records and the details of the extent list implementation. Also switch all users of the xfs_bmbt_set_all(xfs_iext_get_ext(...)) pattern to this new helper. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_bmap.c | 12 ++++++------ fs/xfs/libxfs/xfs_inode_fork.c | 12 ++++++++++++ fs/xfs/libxfs/xfs_inode_fork.h | 2 ++ 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 28ac796abe455..ea1a61c7ace8e 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -4918,7 +4918,7 @@ xfs_bmap_del_extent_delay( da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, got->br_blockcount), da_old); got->br_startblock = nullstartblock((int)da_new); - xfs_bmbt_set_all(xfs_iext_get_ext(ifp, *idx), got); + xfs_iext_update_extent(ifp, *idx, got); trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); break; case BMAP_RIGHT_CONTIG: @@ -4930,7 +4930,7 @@ xfs_bmap_del_extent_delay( da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, got->br_blockcount), da_old); got->br_startblock = nullstartblock((int)da_new); - xfs_bmbt_set_all(xfs_iext_get_ext(ifp, *idx), got); + xfs_iext_update_extent(ifp, *idx, got); trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); break; case 0: @@ -4956,7 +4956,7 @@ xfs_bmap_del_extent_delay( del->br_blockcount); got->br_startblock = nullstartblock((int)got_indlen); - xfs_bmbt_set_all(xfs_iext_get_ext(ifp, *idx), got); + xfs_iext_update_extent(ifp, *idx, got); trace_xfs_bmap_post_update(ip, *idx, 0, _THIS_IP_); new.br_startoff = del_endoff; @@ -5026,7 +5026,7 @@ xfs_bmap_del_extent_cow( got->br_startoff = del_endoff; got->br_blockcount -= del->br_blockcount; got->br_startblock = del->br_startblock + del->br_blockcount; - xfs_bmbt_set_all(xfs_iext_get_ext(ifp, *idx), got); + xfs_iext_update_extent(ifp, *idx, got); trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); break; case BMAP_RIGHT_CONTIG: @@ -5035,7 +5035,7 @@ xfs_bmap_del_extent_cow( */ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); got->br_blockcount -= del->br_blockcount; - xfs_bmbt_set_all(xfs_iext_get_ext(ifp, *idx), got); + xfs_iext_update_extent(ifp, *idx, got); trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); break; case 0: @@ -5044,7 +5044,7 @@ xfs_bmap_del_extent_cow( */ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); got->br_blockcount = del->br_startoff - got->br_startoff; - xfs_bmbt_set_all(xfs_iext_get_ext(ifp, *idx), got); + xfs_iext_update_extent(ifp, *idx, got); trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); new.br_startoff = del_endoff; diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 0e80f34fe97c5..fb310d08dc821 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -2023,3 +2023,15 @@ xfs_iext_get_extent( xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), gotp); return true; } + +void +xfs_iext_update_extent( + struct xfs_ifork *ifp, + xfs_extnum_t idx, + struct xfs_bmbt_irec *gotp) +{ + ASSERT(idx >= 0); + ASSERT(idx < xfs_iext_count(ifp)); + + xfs_bmbt_set_all(xfs_iext_get_ext(ifp, idx), gotp); +} diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 7fb8365326d1a..11af705219f6d 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -187,6 +187,8 @@ bool xfs_iext_lookup_extent(struct xfs_inode *ip, xfs_extnum_t *idxp, struct xfs_bmbt_irec *gotp); bool xfs_iext_get_extent(struct xfs_ifork *ifp, xfs_extnum_t idx, struct xfs_bmbt_irec *gotp); +void xfs_iext_update_extent(struct xfs_ifork *ifp, xfs_extnum_t idx, + struct xfs_bmbt_irec *gotp); extern struct kmem_zone *xfs_ifork_zone; From 50bb44c28614205def9e711190842b4c0242ae79 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 29 Aug 2017 15:44:11 -0700 Subject: [PATCH 29/44] xfs: switch xfs_bmap_local_to_extents to use xfs_iext_insert Use the helper instead of open coding it, to provide a better abstraction for the scalable extent list work. This also gets an additional assert and trace point for free. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_bmap.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index ea1a61c7ace8e..94f2a222bd952 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -880,7 +880,7 @@ xfs_bmap_local_to_extents( xfs_ifork_t *ifp; /* inode fork pointer */ xfs_alloc_arg_t args; /* allocation arguments */ xfs_buf_t *bp; /* buffer for extent block */ - xfs_bmbt_rec_host_t *ep; /* extent record pointer */ + struct xfs_bmbt_irec rec; /* * We don't want to deal with the case of keeping inode data inline yet. @@ -943,9 +943,12 @@ xfs_bmap_local_to_extents( xfs_bmap_local_to_extents_empty(ip, whichfork); flags |= XFS_ILOG_CORE; - xfs_iext_add(ifp, 0, 1); - ep = xfs_iext_get_ext(ifp, 0); - xfs_bmbt_set_allf(ep, 0, args.fsbno, 1, XFS_EXT_NORM); + rec.br_startoff = 0; + rec.br_startblock = args.fsbno; + rec.br_blockcount = 1; + rec.br_state = XFS_EXT_NORM; + xfs_iext_insert(ip, 0, 1, &rec, 0); + trace_xfs_bmap_post_update(ip, 0, whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0, _THIS_IP_); From f2285c148c4167337d12452bebccadd2ad821d5d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 29 Aug 2017 15:44:12 -0700 Subject: [PATCH 30/44] xfs: use xfs_iext_get_extent in xfs_bmap_first_unused Use the bmap abstraction instead of open-coding bmbt details here. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_bmap.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 94f2a222bd952..99b5fa5c4b8d4 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -1359,7 +1359,6 @@ xfs_bmap_first_unused( xfs_fileoff_t lastaddr; /* last block number seen */ xfs_fileoff_t lowest; /* lowest useful block */ xfs_fileoff_t max; /* starting useful block */ - xfs_fileoff_t off; /* offset for this block */ xfs_extnum_t nextents; /* number of extent entries */ ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE || @@ -1376,16 +1375,19 @@ xfs_bmap_first_unused( lowest = *first_unused; nextents = xfs_iext_count(ifp); for (idx = 0, lastaddr = 0, max = lowest; idx < nextents; idx++) { - xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, idx); - off = xfs_bmbt_get_startoff(ep); + struct xfs_bmbt_irec got; + + xfs_iext_get_extent(ifp, idx, &got); + /* * See if the hole before this extent will work. */ - if (off >= lowest + len && off - max >= len) { + if (got.br_startoff >= lowest + len && + got.br_startoff - max >= len) { *first_unused = max; return 0; } - lastaddr = off + xfs_bmbt_get_blockcount(ep); + lastaddr = got.br_startoff + got.br_blockcount; max = XFS_FILEOFF_MAX(lastaddr, lowest); } *first_unused = max; From 05b7c8ab2be71e6fef4615451e7af1bc79ffdf29 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 29 Aug 2017 15:44:12 -0700 Subject: [PATCH 31/44] xfs: move some code around inside xfs_bmap_shift_extents For the first right move we need to look up next_fsb. That means our last fsb that contains next_fsb must also be the current extent, so take advantage of that by moving the code around a bit. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_bmap.c | 54 +++++++++++++++++++++------------------- 1 file changed, 29 insertions(+), 25 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 99b5fa5c4b8d4..e529bed6be412 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -6127,7 +6127,6 @@ xfs_bmap_shift_extents( ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(direction == SHIFT_LEFT || direction == SHIFT_RIGHT); - ASSERT(*next_fsb != NULLFSBLOCK || direction == SHIFT_RIGHT); ifp = XFS_IFORK_PTR(ip, whichfork); if (!(ifp->if_flags & XFS_IFEXTENTS)) { @@ -6159,43 +6158,48 @@ xfs_bmap_shift_extents( * In case of first right shift, we need to initialize next_fsb */ if (*next_fsb == NULLFSBLOCK) { - gotp = xfs_iext_get_ext(ifp, total_extents - 1); + ASSERT(direction == SHIFT_RIGHT); + + current_ext = total_extents - 1; + gotp = xfs_iext_get_ext(ifp, current_ext); xfs_bmbt_get_all(gotp, &got); *next_fsb = got.br_startoff; if (stop_fsb > *next_fsb) { *done = 1; goto del_cursor; } + } else { + /* + * Look up the extent index for the fsb where we start shifting. We can + * henceforth iterate with current_ext as extent list changes are locked + * out via ilock. + * + * gotp can be null in 2 cases: 1) if there are no extents or 2) + * *next_fsb lies in a hole beyond which there are no extents. Either + * way, we are done. + */ + gotp = xfs_iext_bno_to_ext(ifp, *next_fsb, ¤t_ext); + if (!gotp) { + *done = 1; + goto del_cursor; + } } /* Lookup the extent index at which we have to stop */ if (direction == SHIFT_RIGHT) { - gotp = xfs_iext_bno_to_ext(ifp, stop_fsb, &stop_extent); + xfs_iext_bno_to_ext(ifp, stop_fsb, &stop_extent); /* Make stop_extent exclusive of shift range */ stop_extent--; - } else + if (current_ext <= stop_extent) { + error = -EIO; + goto del_cursor; + } + } else { stop_extent = total_extents; - - /* - * Look up the extent index for the fsb where we start shifting. We can - * henceforth iterate with current_ext as extent list changes are locked - * out via ilock. - * - * gotp can be null in 2 cases: 1) if there are no extents or 2) - * *next_fsb lies in a hole beyond which there are no extents. Either - * way, we are done. - */ - gotp = xfs_iext_bno_to_ext(ifp, *next_fsb, ¤t_ext); - if (!gotp) { - *done = 1; - goto del_cursor; - } - - /* some sanity checking before we finally start shifting extents */ - if ((direction == SHIFT_LEFT && current_ext >= stop_extent) || - (direction == SHIFT_RIGHT && current_ext <= stop_extent)) { - error = -EIO; - goto del_cursor; + if (current_ext >= stop_extent) { + error = -EIO; + goto del_cursor; + } } while (nexts++ < num_exts) { From 4da6b514eaa168c246fc5c1245c4f82084bcf24e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 29 Aug 2017 15:44:13 -0700 Subject: [PATCH 32/44] xfs: use xfs_iext_*_extent helpers in xfs_bmap_shift_extents This abstracts the function away from details of the low-level extent list implementation. Note that it seems like the previous implementation of rmap for the merge case was completely broken, but it no seems appear to trigger that. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_bmap.c | 180 +++++++++++++++++++-------------------- 1 file changed, 88 insertions(+), 92 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index e529bed6be412..88beac28258ee 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -5881,32 +5881,26 @@ xfs_bmse_merge( int whichfork, xfs_fileoff_t shift, /* shift fsb */ int current_ext, /* idx of gotp */ - struct xfs_bmbt_rec_host *gotp, /* extent to shift */ - struct xfs_bmbt_rec_host *leftp, /* preceding extent */ + struct xfs_bmbt_irec *got, /* extent to shift */ + struct xfs_bmbt_irec *left, /* preceding extent */ struct xfs_btree_cur *cur, - int *logflags) /* output */ + int *logflags, /* output */ + struct xfs_defer_ops *dfops) { - struct xfs_bmbt_irec got; - struct xfs_bmbt_irec left; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_bmbt_irec new; xfs_filblks_t blockcount; int error, i; struct xfs_mount *mp = ip->i_mount; - xfs_bmbt_get_all(gotp, &got); - xfs_bmbt_get_all(leftp, &left); - blockcount = left.br_blockcount + got.br_blockcount; + blockcount = left->br_blockcount + got->br_blockcount; ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - ASSERT(xfs_bmse_can_merge(&left, &got, shift)); + ASSERT(xfs_bmse_can_merge(left, got, shift)); - /* - * Merge the in-core extents. Note that the host record pointers and - * current_ext index are invalid once the extent has been removed via - * xfs_iext_remove(). - */ - xfs_bmbt_set_blockcount(leftp, blockcount); - xfs_iext_remove(ip, current_ext, 1, 0); + new = *left; + new.br_blockcount = blockcount; /* * Update the on-disk extent count, the btree if necessary and log the @@ -5917,12 +5911,12 @@ xfs_bmse_merge( *logflags |= XFS_ILOG_CORE; if (!cur) { *logflags |= XFS_ILOG_DEXT; - return 0; + goto done; } /* lookup and remove the extent to merge */ - error = xfs_bmbt_lookup_eq(cur, got.br_startoff, got.br_startblock, - got.br_blockcount, &i); + error = xfs_bmbt_lookup_eq(cur, got->br_startoff, got->br_startblock, + got->br_blockcount, &i); if (error) return error; XFS_WANT_CORRUPTED_RETURN(mp, i == 1); @@ -5933,16 +5927,29 @@ xfs_bmse_merge( XFS_WANT_CORRUPTED_RETURN(mp, i == 1); /* lookup and update size of the previous extent */ - error = xfs_bmbt_lookup_eq(cur, left.br_startoff, left.br_startblock, - left.br_blockcount, &i); + error = xfs_bmbt_lookup_eq(cur, left->br_startoff, left->br_startblock, + left->br_blockcount, &i); if (error) return error; XFS_WANT_CORRUPTED_RETURN(mp, i == 1); - left.br_blockcount = blockcount; + error = xfs_bmbt_update(cur, new.br_startoff, new.br_startblock, + new.br_blockcount, new.br_state); + if (error) + return error; - return xfs_bmbt_update(cur, left.br_startoff, left.br_startblock, - left.br_blockcount, left.br_state); +done: + xfs_iext_update_extent(ifp, current_ext - 1, &new); + xfs_iext_remove(ip, current_ext, 1, 0); + + /* update reverse mapping */ + error = xfs_rmap_unmap_extent(mp, dfops, ip, whichfork, got); + if (error) + return error; + error = xfs_rmap_unmap_extent(mp, dfops, ip, whichfork, left); + if (error) + return error; + return xfs_rmap_map_extent(mp, dfops, ip, whichfork, &new); } /* @@ -5954,7 +5961,7 @@ xfs_bmse_shift_one( int whichfork, xfs_fileoff_t offset_shift_fsb, int *current_ext, - struct xfs_bmbt_rec_host *gotp, + struct xfs_bmbt_irec *got, struct xfs_btree_cur *cur, int *logflags, enum shift_direction direction, @@ -5963,9 +5970,7 @@ xfs_bmse_shift_one( struct xfs_ifork *ifp; struct xfs_mount *mp; xfs_fileoff_t startoff; - struct xfs_bmbt_rec_host *adj_irecp; - struct xfs_bmbt_irec got; - struct xfs_bmbt_irec adj_irec; + struct xfs_bmbt_irec adj_irec, new; int error; int i; int total_extents; @@ -5974,13 +5979,11 @@ xfs_bmse_shift_one( ifp = XFS_IFORK_PTR(ip, whichfork); total_extents = xfs_iext_count(ifp); - xfs_bmbt_get_all(gotp, &got); - /* delalloc extents should be prevented by caller */ - XFS_WANT_CORRUPTED_RETURN(mp, !isnullstartblock(got.br_startblock)); + XFS_WANT_CORRUPTED_RETURN(mp, !isnullstartblock(got->br_startblock)); if (direction == SHIFT_LEFT) { - startoff = got.br_startoff - offset_shift_fsb; + startoff = got->br_startoff - offset_shift_fsb; /* * Check for merge if we've got an extent to the left, @@ -5988,46 +5991,39 @@ xfs_bmse_shift_one( * of the file for the shift. */ if (!*current_ext) { - if (got.br_startoff < offset_shift_fsb) + if (got->br_startoff < offset_shift_fsb) return -EINVAL; goto update_current_ext; } + /* - * grab the left extent and check for a large - * enough hole. + * grab the left extent and check for a large enough hole. */ - adj_irecp = xfs_iext_get_ext(ifp, *current_ext - 1); - xfs_bmbt_get_all(adj_irecp, &adj_irec); - - if (startoff < - adj_irec.br_startoff + adj_irec.br_blockcount) + xfs_iext_get_extent(ifp, *current_ext - 1, &adj_irec); + if (startoff < adj_irec.br_startoff + adj_irec.br_blockcount) return -EINVAL; /* check whether to merge the extent or shift it down */ - if (xfs_bmse_can_merge(&adj_irec, &got, - offset_shift_fsb)) { - error = xfs_bmse_merge(ip, whichfork, offset_shift_fsb, - *current_ext, gotp, adj_irecp, - cur, logflags); - if (error) - return error; - adj_irec = got; - goto update_rmap; + if (xfs_bmse_can_merge(&adj_irec, got, offset_shift_fsb)) { + return xfs_bmse_merge(ip, whichfork, offset_shift_fsb, + *current_ext, got, &adj_irec, + cur, logflags, dfops); } } else { - startoff = got.br_startoff + offset_shift_fsb; + startoff = got->br_startoff + offset_shift_fsb; /* nothing to move if this is the last extent */ if (*current_ext >= (total_extents - 1)) goto update_current_ext; + /* * If this is not the last extent in the file, make sure there * is enough room between current extent and next extent for * accommodating the shift. */ - adj_irecp = xfs_iext_get_ext(ifp, *current_ext + 1); - xfs_bmbt_get_all(adj_irecp, &adj_irec); - if (startoff + got.br_blockcount > adj_irec.br_startoff) + xfs_iext_get_extent(ifp, *current_ext + 1, &adj_irec); + if (startoff + got->br_blockcount > adj_irec.br_startoff) return -EINVAL; + /* * Unlike a left shift (which involves a hole punch), * a right shift does not modify extent neighbors @@ -6035,45 +6031,48 @@ xfs_bmse_shift_one( * in this scenario. Check anyways and warn if we * encounter two extents that could be one. */ - if (xfs_bmse_can_merge(&got, &adj_irec, offset_shift_fsb)) + if (xfs_bmse_can_merge(got, &adj_irec, offset_shift_fsb)) WARN_ON_ONCE(1); } + /* * Increment the extent index for the next iteration, update the start * offset of the in-core extent and update the btree if applicable. */ update_current_ext: - if (direction == SHIFT_LEFT) - (*current_ext)++; - else - (*current_ext)--; - xfs_bmbt_set_startoff(gotp, startoff); *logflags |= XFS_ILOG_CORE; - adj_irec = got; - if (!cur) { + + new = *got; + new.br_startoff = startoff; + + if (cur) { + error = xfs_bmbt_lookup_eq(cur, got->br_startoff, + got->br_startblock, got->br_blockcount, &i); + if (error) + return error; + XFS_WANT_CORRUPTED_RETURN(mp, i == 1); + + error = xfs_bmbt_update(cur, new.br_startoff, + new.br_startblock, new.br_blockcount, + new.br_state); + if (error) + return error; + } else { *logflags |= XFS_ILOG_DEXT; - goto update_rmap; } - error = xfs_bmbt_lookup_eq(cur, got.br_startoff, got.br_startblock, - got.br_blockcount, &i); - if (error) - return error; - XFS_WANT_CORRUPTED_RETURN(mp, i == 1); + xfs_iext_update_extent(ifp, *current_ext, &new); - got.br_startoff = startoff; - error = xfs_bmbt_update(cur, got.br_startoff, got.br_startblock, - got.br_blockcount, got.br_state); - if (error) - return error; + if (direction == SHIFT_LEFT) + (*current_ext)++; + else + (*current_ext)--; -update_rmap: /* update reverse mapping */ - error = xfs_rmap_unmap_extent(mp, dfops, ip, whichfork, &adj_irec); + error = xfs_rmap_unmap_extent(mp, dfops, ip, whichfork, got); if (error) return error; - adj_irec.br_startoff = startoff; - return xfs_rmap_map_extent(mp, dfops, ip, whichfork, &adj_irec); + return xfs_rmap_map_extent(mp, dfops, ip, whichfork, &new); } /* @@ -6100,7 +6099,6 @@ xfs_bmap_shift_extents( int num_exts) { struct xfs_btree_cur *cur = NULL; - struct xfs_bmbt_rec_host *gotp; struct xfs_bmbt_irec got; struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp; @@ -6161,25 +6159,23 @@ xfs_bmap_shift_extents( ASSERT(direction == SHIFT_RIGHT); current_ext = total_extents - 1; - gotp = xfs_iext_get_ext(ifp, current_ext); - xfs_bmbt_get_all(gotp, &got); - *next_fsb = got.br_startoff; - if (stop_fsb > *next_fsb) { + xfs_iext_get_extent(ifp, current_ext, &got); + if (stop_fsb > got.br_startoff) { *done = 1; goto del_cursor; } + *next_fsb = got.br_startoff; } else { /* * Look up the extent index for the fsb where we start shifting. We can * henceforth iterate with current_ext as extent list changes are locked * out via ilock. * - * gotp can be null in 2 cases: 1) if there are no extents or 2) - * *next_fsb lies in a hole beyond which there are no extents. Either - * way, we are done. + * If next_fsb lies in a hole beyond which there are no extents we are + * done. */ - gotp = xfs_iext_bno_to_ext(ifp, *next_fsb, ¤t_ext); - if (!gotp) { + if (!xfs_iext_lookup_extent(ip, ifp, *next_fsb, ¤t_ext, + &got)) { *done = 1; goto del_cursor; } @@ -6187,7 +6183,9 @@ xfs_bmap_shift_extents( /* Lookup the extent index at which we have to stop */ if (direction == SHIFT_RIGHT) { - xfs_iext_bno_to_ext(ifp, stop_fsb, &stop_extent); + struct xfs_bmbt_irec s; + + xfs_iext_lookup_extent(ip, ifp, stop_fsb, &stop_extent, &s); /* Make stop_extent exclusive of shift range */ stop_extent--; if (current_ext <= stop_extent) { @@ -6204,7 +6202,7 @@ xfs_bmap_shift_extents( while (nexts++ < num_exts) { error = xfs_bmse_shift_one(ip, whichfork, offset_shift_fsb, - ¤t_ext, gotp, cur, &logflags, + ¤t_ext, &got, cur, &logflags, direction, dfops); if (error) goto del_cursor; @@ -6222,13 +6220,11 @@ xfs_bmap_shift_extents( *next_fsb = NULLFSBLOCK; break; } - gotp = xfs_iext_get_ext(ifp, current_ext); + xfs_iext_get_extent(ifp, current_ext, &got); } - if (!*done) { - xfs_bmbt_get_all(gotp, &got); + if (!*done) *next_fsb = got.br_startoff; - } del_cursor: if (cur) From 4c35445b591ee669097c5b98e4bb677808e9f582 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 29 Aug 2017 15:44:13 -0700 Subject: [PATCH 33/44] xfs: use xfs_iext_*_extent helpers in xfs_bmap_split_extent_at This abstracts the function away from details of the low-level extent list implementation. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_bmap.c | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 88beac28258ee..9b877024c804d 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -6253,7 +6253,6 @@ xfs_bmap_split_extent_at( { int whichfork = XFS_DATA_FORK; struct xfs_btree_cur *cur = NULL; - struct xfs_bmbt_rec_host *gotp; struct xfs_bmbt_irec got; struct xfs_bmbt_irec new; /* split extent */ struct xfs_mount *mp = ip->i_mount; @@ -6285,21 +6284,10 @@ xfs_bmap_split_extent_at( } /* - * gotp can be null in 2 cases: 1) if there are no extents - * or 2) split_fsb lies in a hole beyond which there are - * no extents. Either way, we are done. + * If there are not extents, or split_fsb lies in a hole we are done. */ - gotp = xfs_iext_bno_to_ext(ifp, split_fsb, ¤t_ext); - if (!gotp) - return 0; - - xfs_bmbt_get_all(gotp, &got); - - /* - * Check split_fsb lies in a hole or the start boundary offset - * of the extent. - */ - if (got.br_startoff >= split_fsb) + if (!xfs_iext_lookup_extent(ip, ifp, split_fsb, ¤t_ext, &got) || + got.br_startoff >= split_fsb) return 0; gotblkcnt = split_fsb - got.br_startoff; @@ -6322,8 +6310,8 @@ xfs_bmap_split_extent_at( XFS_WANT_CORRUPTED_GOTO(mp, i == 1, del_cursor); } - xfs_bmbt_set_blockcount(gotp, gotblkcnt); got.br_blockcount = gotblkcnt; + xfs_iext_update_extent(ifp, current_ext, &got); logflags = XFS_ILOG_CORE; if (cur) { From e17a5c6f0e3609da83270f42698b1dfedde86f44 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 29 Aug 2017 15:44:14 -0700 Subject: [PATCH 34/44] xfs: rewrite xfs_bmap_count_leaves using xfs_iext_get_extent This avoids poking into the internals of the extent list. Also return the number of extents as the return value instead of an additional by reference argument, and make it available to callers outside of xfs_bmap_util.c Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_bmap_util.c | 21 ++++++++++----------- fs/xfs/xfs_bmap_util.h | 1 + 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 8661be0aacaae..cd9a5400ba4fe 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -222,22 +222,21 @@ xfs_bmap_eof( * Count leaf blocks given a range of extent records. Delayed allocation * extents are not counted towards the totals. */ -STATIC void +xfs_extnum_t xfs_bmap_count_leaves( struct xfs_ifork *ifp, - xfs_extnum_t *numrecs, xfs_filblks_t *count) { - xfs_extnum_t i; - xfs_extnum_t nr_exts = xfs_iext_count(ifp); - - for (i = 0; i < nr_exts; i++) { - xfs_bmbt_rec_host_t *frp = xfs_iext_get_ext(ifp, i); - if (!isnullstartblock(xfs_bmbt_get_startblock(frp))) { - (*numrecs)++; - *count += xfs_bmbt_get_blockcount(frp); + struct xfs_bmbt_irec got; + xfs_extnum_t numrecs = 0, i = 0; + + while (xfs_iext_get_extent(ifp, i++, &got)) { + if (!isnullstartblock(got.br_startblock)) { + *count += got.br_blockcount; + numrecs++; } } + return numrecs; } /* @@ -370,7 +369,7 @@ xfs_bmap_count_blocks( switch (XFS_IFORK_FORMAT(ip, whichfork)) { case XFS_DINODE_FMT_EXTENTS: - xfs_bmap_count_leaves(ifp, nextents, count); + *nextents = xfs_bmap_count_leaves(ifp, count); return 0; case XFS_DINODE_FMT_BTREE: if (!(ifp->if_flags & XFS_IFEXTENTS)) { diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index 0cede10435717..0eaa81dc49be6 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -70,6 +70,7 @@ int xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip, xfs_daddr_t xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb); +xfs_extnum_t xfs_bmap_count_leaves(struct xfs_ifork *ifp, xfs_filblks_t *count); int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, xfs_extnum_t *nextents, xfs_filblks_t *count); From 8bfadd8d03b7ff8fe70be3d3a6980e6a20cb5a92 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 29 Aug 2017 15:44:14 -0700 Subject: [PATCH 35/44] xfs: replace xfs_qm_get_rtblks with a direct call to xfs_bmap_count_leaves Use the existing functionality instead of directly poking into the extent list. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_qm.c | 44 ++++++++++++-------------------------------- 1 file changed, 12 insertions(+), 32 deletions(-) diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 15751dc2a27df..010a13a201aad 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -31,6 +31,7 @@ #include "xfs_error.h" #include "xfs_bmap.h" #include "xfs_bmap_btree.h" +#include "xfs_bmap_util.h" #include "xfs_trans.h" #include "xfs_trans_space.h" #include "xfs_qm.h" @@ -1120,31 +1121,6 @@ xfs_qm_quotacheck_dqadjust( return 0; } -STATIC int -xfs_qm_get_rtblks( - xfs_inode_t *ip, - xfs_qcnt_t *O_rtblks) -{ - xfs_filblks_t rtblks; /* total rt blks */ - xfs_extnum_t idx; /* extent record index */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_extnum_t nextents; /* number of extent entries */ - int error; - - ASSERT(XFS_IS_REALTIME_INODE(ip)); - ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); - if (!(ifp->if_flags & XFS_IFEXTENTS)) { - if ((error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK))) - return error; - } - rtblks = 0; - nextents = xfs_iext_count(ifp); - for (idx = 0; idx < nextents; idx++) - rtblks += xfs_bmbt_get_blockcount(xfs_iext_get_ext(ifp, idx)); - *O_rtblks = (xfs_qcnt_t)rtblks; - return 0; -} - /* * callback routine supplied to bulkstat(). Given an inumber, find its * dquots and update them to account for resources taken by that inode. @@ -1160,7 +1136,8 @@ xfs_qm_dqusage_adjust( int *res) /* result code value */ { xfs_inode_t *ip; - xfs_qcnt_t nblks, rtblks = 0; + xfs_qcnt_t nblks; + xfs_filblks_t rtblks = 0; /* total rt blks */ int error; ASSERT(XFS_IS_QUOTA_RUNNING(mp)); @@ -1190,12 +1167,15 @@ xfs_qm_dqusage_adjust( ASSERT(ip->i_delayed_blks == 0); if (XFS_IS_REALTIME_INODE(ip)) { - /* - * Walk thru the extent list and count the realtime blocks. - */ - error = xfs_qm_get_rtblks(ip, &rtblks); - if (error) - goto error0; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); + if (error) + goto error0; + } + + xfs_bmap_count_leaves(ifp, &rtblks); } nblks = (xfs_qcnt_t)ip->i_d.di_nblocks - rtblks; From 742d84290739ae908f1b61b7d17ea382c8c0073a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 30 Aug 2017 09:23:01 -0700 Subject: [PATCH 36/44] xfs: disable per-inode DAX flag Currently flag switching can be used to easily crash the kernel. Disable the per-inode DAX flag until that is sorted out. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_ioctl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 9c0c7a920304e..06ca244c323ae 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1008,11 +1008,12 @@ xfs_diflags_to_linux( inode->i_flags |= S_NOATIME; else inode->i_flags &= ~S_NOATIME; +#if 0 /* disabled until the flag switching races are sorted out */ if (xflags & FS_XFLAG_DAX) inode->i_flags |= S_DAX; else inode->i_flags &= ~S_DAX; - +#endif } static int From 47c7d0b19502583120c3f396c7559e7a77288a68 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 30 Aug 2017 09:23:12 -0700 Subject: [PATCH 37/44] xfs: fix incorrect log_flushed on fsync When calling into _xfs_log_force{,_lsn}() with a pointer to log_flushed variable, log_flushed will be set to 1 if: 1. xlog_sync() is called to flush the active log buffer AND/OR 2. xlog_wait() is called to wait on a syncing log buffers xfs_file_fsync() checks the value of log_flushed after _xfs_log_force_lsn() call to optimize away an explicit PREFLUSH request to the data block device after writing out all the file's pages to disk. This optimization is incorrect in the following sequence of events: Task A Task B ------------------------------------------------------- xfs_file_fsync() _xfs_log_force_lsn() xlog_sync() [submit PREFLUSH] xfs_file_fsync() file_write_and_wait_range() [submit WRITE X] [endio WRITE X] _xfs_log_force_lsn() xlog_wait() [endio PREFLUSH] The write X is not guarantied to be on persistent storage when PREFLUSH request in completed, because write A was submitted after the PREFLUSH request, but xfs_file_fsync() of task A will be notified of log_flushed=1 and will skip explicit flush. If the system crashes after fsync of task A, write X may not be present on disk after reboot. This bug was discovered and demonstrated using Josef Bacik's dm-log-writes target, which can be used to record block io operations and then replay a subset of these operations onto the target device. The test goes something like this: - Use fsx to execute ops of a file and record ops on log device - Every now and then fsync the file, store md5 of file and mark the location in the log - Then replay log onto device for each mark, mount fs and compare md5 of file to stored value Cc: Christoph Hellwig Cc: Josef Bacik Cc: Signed-off-by: Amir Goldstein Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_log.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index bcb2f860e508d..c5107c7bc4bf8 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -3375,8 +3375,6 @@ _xfs_log_force( */ if (iclog->ic_state & XLOG_STATE_IOERROR) return -EIO; - if (log_flushed) - *log_flushed = 1; } else { no_sleep: @@ -3480,8 +3478,6 @@ _xfs_log_force_lsn( xlog_wait(&iclog->ic_prev->ic_write_wait, &log->l_icloglock); - if (log_flushed) - *log_flushed = 1; already_slept = 1; goto try_again; } @@ -3515,9 +3511,6 @@ _xfs_log_force_lsn( */ if (iclog->ic_state & XLOG_STATE_IOERROR) return -EIO; - - if (log_flushed) - *log_flushed = 1; } else { /* just return */ spin_unlock(&log->l_icloglock); } From f91fb956f243086c7a95c508f01152c74c35f6ce Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Wed, 30 Aug 2017 09:24:12 -0700 Subject: [PATCH 38/44] xfs: remove unused flags arg from xfs_file_iomap_begin_delay Signed-off-by: Eric Sandeen Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_iomap.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 1b625d0504419..79cb5b3d140c5 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -520,7 +520,6 @@ xfs_file_iomap_begin_delay( struct inode *inode, loff_t offset, loff_t count, - unsigned flags, struct iomap *iomap) { struct xfs_inode *ip = XFS_I(inode); @@ -984,8 +983,7 @@ xfs_file_iomap_begin( if (((flags & (IOMAP_WRITE | IOMAP_DIRECT)) == IOMAP_WRITE) && !IS_DAX(inode) && !xfs_get_extsz_hint(ip)) { /* Reserve delalloc blocks for regular writeback. */ - return xfs_file_iomap_begin_delay(inode, offset, length, flags, - iomap); + return xfs_file_iomap_begin_delay(inode, offset, length, iomap); } if (need_excl_ilock(ip, flags)) { From 4cc1ee5e654114aa7fac6993488ad2cd0b3411bb Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 30 Aug 2017 16:06:36 -0700 Subject: [PATCH 39/44] xfs: simplify the rmap code in xfs_bmse_merge In Christoph's patch to refactor xfs_bmse_merge, the updated rmap code does more work than it needs to (because map-extent auto-merges records). Remove the unnecessary unmap and save ourselves a deferred op. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_bmap.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 9b877024c804d..9558f5ee1bf96 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -5942,13 +5942,12 @@ xfs_bmse_merge( xfs_iext_update_extent(ifp, current_ext - 1, &new); xfs_iext_remove(ip, current_ext, 1, 0); - /* update reverse mapping */ + /* update reverse mapping. rmap functions merge the rmaps for us */ error = xfs_rmap_unmap_extent(mp, dfops, ip, whichfork, got); if (error) return error; - error = xfs_rmap_unmap_extent(mp, dfops, ip, whichfork, left); - if (error) - return error; + memcpy(&new, got, sizeof(new)); + new.br_startoff = left->br_startoff + left->br_blockcount; return xfs_rmap_map_extent(mp, dfops, ip, whichfork, &new); } From d897246df9fc0a5df97a784bf7b072be4a6ae479 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 31 Aug 2017 13:47:35 -0700 Subject: [PATCH 40/44] fsmap: fix documentation of FMR_OF_LAST The FMR_OF_LAST flag is set on the last fsmap record being returned for the dataset requested, contrary to what the header file says. Fix the docs to reflect the behavior of all fsmap implementations. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- include/uapi/linux/fsmap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/fsmap.h b/include/uapi/linux/fsmap.h index 7e8e5f0bd6d26..e5213c3e38b21 100644 --- a/include/uapi/linux/fsmap.h +++ b/include/uapi/linux/fsmap.h @@ -96,7 +96,7 @@ fsmap_advance( #define FMR_OF_EXTENT_MAP 0x4 /* segment = extent map */ #define FMR_OF_SHARED 0x8 /* segment = shared with another file */ #define FMR_OF_SPECIAL_OWNER 0x10 /* owner is a special value */ -#define FMR_OF_LAST 0x20 /* segment is the last in the FS */ +#define FMR_OF_LAST 0x20 /* segment is the last in the dataset */ /* Each FS gets to define its own special owner codes. */ #define FMR_OWNER(type, code) (((__u64)type << 32) | \ From 7bf7a193a90cadccaad21c5970435c665c40fe27 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 31 Aug 2017 15:11:06 -0700 Subject: [PATCH 41/44] xfs: fix compiler warnings Fix up all the compiler warnings that have crept in. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_bmap.c | 2 +- fs/xfs/libxfs/xfs_inode_fork.c | 9 +++------ fs/xfs/xfs_buf_item.c | 2 ++ fs/xfs/xfs_iops.c | 2 +- fs/xfs/xfs_log_recover.c | 4 ++++ 5 files changed, 11 insertions(+), 8 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 9558f5ee1bf96..459f4b4f08fe5 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -579,7 +579,7 @@ xfs_bmap_validate_ret( #else #define xfs_bmap_check_leaf_extents(cur, ip, whichfork) do { } while (0) -#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap) +#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap) do { } while (0) #endif /* DEBUG */ /* diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index fb310d08dc821..31840ca24018e 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -1499,14 +1499,11 @@ xfs_iext_realloc_indirect( xfs_ifork_t *ifp, /* inode fork pointer */ int new_size) /* new indirection array size */ { - int nlists; /* number of irec's (ex lists) */ - int size; /* current indirection array size */ - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - size = nlists * sizeof(xfs_ext_irec_t); ASSERT(ifp->if_real_bytes); - ASSERT((new_size >= 0) && (new_size != size)); + ASSERT((new_size >= 0) && + (new_size != ((ifp->if_real_bytes / XFS_IEXT_BUFSZ) * + sizeof(xfs_ext_irec_t)))); if (new_size == 0) { xfs_iext_destroy(ifp); } else { diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index ef2c1375f0920..e0a0af0946f23 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -570,7 +570,9 @@ xfs_buf_item_unlock( bool aborted = !!(lip->li_flags & XFS_LI_ABORTED); bool hold = !!(bip->bli_flags & XFS_BLI_HOLD); bool dirty = !!(bip->bli_flags & XFS_BLI_DIRTY); +#if defined(DEBUG) || defined(XFS_WARN) bool ordered = !!(bip->bli_flags & XFS_BLI_ORDERED); +#endif /* Clear the buffer's association with this transaction. */ bp->b_transp = NULL; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 469c9fa4c178d..17081c77ef86e 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -817,7 +817,7 @@ xfs_vn_setattr_nonsize( * Caution: The caller of this function is responsible for calling * setattr_prepare() or otherwise verifying the change is fine. */ -int +STATIC int xfs_setattr_size( struct xfs_inode *ip, struct iattr *iattr) diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index a36239980cf71..ee34899396b26 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -4827,12 +4827,16 @@ xlog_recover_process_intents( int error = 0; struct xfs_ail_cursor cur; struct xfs_ail *ailp; +#if defined(DEBUG) || defined(XFS_WARN) xfs_lsn_t last_lsn; +#endif ailp = log->l_ailp; spin_lock(&ailp->xa_lock); lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); +#if defined(DEBUG) || defined(XFS_WARN) last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block); +#endif while (lip != NULL) { /* * We're done when we see something other than an intent. From dd60687ee541ca3f6df8758f38e6f22f57c42a37 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 2 Sep 2017 08:21:20 -0700 Subject: [PATCH 42/44] xfs: don't set v3 xflags for v2 inodes Reject attempts to set XFLAGS that correspond to di_flags2 inode flags if the inode isn't a v3 inode, because di_flags2 only exists on v3. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_ioctl.c | 38 +++++++++++++++++++++++++------------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 06ca244c323ae..5049e8ab6e302 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -931,16 +931,15 @@ xfs_ioc_fsgetxattr( return 0; } -STATIC void -xfs_set_diflags( +STATIC uint16_t +xfs_flags2diflags( struct xfs_inode *ip, unsigned int xflags) { - unsigned int di_flags; - uint64_t di_flags2; - /* can't set PREALLOC this way, just preserve it */ - di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC); + uint16_t di_flags = + (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC); + if (xflags & FS_XFLAG_IMMUTABLE) di_flags |= XFS_DIFLAG_IMMUTABLE; if (xflags & FS_XFLAG_APPEND) @@ -970,19 +969,24 @@ xfs_set_diflags( if (xflags & FS_XFLAG_EXTSIZE) di_flags |= XFS_DIFLAG_EXTSIZE; } - ip->i_d.di_flags = di_flags; - /* diflags2 only valid for v3 inodes. */ - if (ip->i_d.di_version < 3) - return; + return di_flags; +} + +STATIC uint64_t +xfs_flags2diflags2( + struct xfs_inode *ip, + unsigned int xflags) +{ + uint64_t di_flags2 = + (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK); - di_flags2 = (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK); if (xflags & FS_XFLAG_DAX) di_flags2 |= XFS_DIFLAG2_DAX; if (xflags & FS_XFLAG_COWEXTSIZE) di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; - ip->i_d.di_flags2 = di_flags2; + return di_flags2; } STATIC void @@ -1023,6 +1027,7 @@ xfs_ioctl_setattr_xflags( struct fsxattr *fa) { struct xfs_mount *mp = ip->i_mount; + uint64_t di_flags2; /* Can't change realtime flag if any extents are allocated. */ if ((ip->i_d.di_nextents || ip->i_delayed_blks) && @@ -1053,7 +1058,14 @@ xfs_ioctl_setattr_xflags( !capable(CAP_LINUX_IMMUTABLE)) return -EPERM; - xfs_set_diflags(ip, fa->fsx_xflags); + /* diflags2 only valid for v3 inodes. */ + di_flags2 = xfs_flags2diflags2(ip, fa->fsx_xflags); + if (di_flags2 && ip->i_d.di_version < 3) + return -EINVAL; + + ip->i_d.di_flags = xfs_flags2diflags(ip, fa->fsx_xflags); + ip->i_d.di_flags2 = di_flags2; + xfs_diflags_to_linux(ip); xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); From 8353a814f2518dcfa79a5bb77afd0e7dfa391bb1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 2 Sep 2017 09:53:41 -0700 Subject: [PATCH 43/44] xfs: open code end_buffer_async_write in xfs_finish_page_writeback Our loop in xfs_finish_page_writeback, which iterates over all buffer heads in a page and then calls end_buffer_async_write, which also iterates over all buffers in the page to check if any I/O is in flight is not only inefficient, but also potentially dangerous as end_buffer_async_write can cause the page and all buffers to be freed. Replace it with a single loop that does the work of end_buffer_async_write on a per-page basis. Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_aops.c | 71 +++++++++++++++++++++++++++++++---------------- 1 file changed, 47 insertions(+), 24 deletions(-) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 6bf120bb1a17d..f9efd67f6fa10 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -85,11 +85,11 @@ xfs_find_bdev_for_inode( * associated buffer_heads, paying attention to the start and end offsets that * we need to process on the page. * - * Landmine Warning: bh->b_end_io() will call end_page_writeback() on the last - * buffer in the IO. Once it does this, it is unsafe to access the bufferhead or - * the page at all, as we may be racing with memory reclaim and it can free both - * the bufferhead chain and the page as it will see the page as clean and - * unused. + * Note that we open code the action in end_buffer_async_write here so that we + * only have to iterate over the buffers attached to the page once. This is not + * only more efficient, but also ensures that we only calls end_page_writeback + * at the end of the iteration, and thus avoids the pitfall of having the page + * and buffers potentially freed after every call to end_buffer_async_write. */ static void xfs_finish_page_writeback( @@ -97,29 +97,44 @@ xfs_finish_page_writeback( struct bio_vec *bvec, int error) { - unsigned int end = bvec->bv_offset + bvec->bv_len - 1; - struct buffer_head *head, *bh, *next; + struct buffer_head *head = page_buffers(bvec->bv_page), *bh = head; + bool busy = false; unsigned int off = 0; - unsigned int bsize; + unsigned long flags; ASSERT(bvec->bv_offset < PAGE_SIZE); ASSERT((bvec->bv_offset & (i_blocksize(inode) - 1)) == 0); - ASSERT(end < PAGE_SIZE); + ASSERT(bvec->bv_offset + bvec->bv_len <= PAGE_SIZE); ASSERT((bvec->bv_len & (i_blocksize(inode) - 1)) == 0); - bh = head = page_buffers(bvec->bv_page); - - bsize = bh->b_size; + local_irq_save(flags); + bit_spin_lock(BH_Uptodate_Lock, &head->b_state); do { - if (off > end) - break; - next = bh->b_this_page; - if (off < bvec->bv_offset) - goto next_bh; - bh->b_end_io(bh, !error); -next_bh: - off += bsize; - } while ((bh = next) != head); + if (off >= bvec->bv_offset && + off < bvec->bv_offset + bvec->bv_len) { + ASSERT(buffer_async_write(bh)); + ASSERT(bh->b_end_io == NULL); + + if (error) { + mark_buffer_write_io_error(bh); + clear_buffer_uptodate(bh); + SetPageError(bvec->bv_page); + } else { + set_buffer_uptodate(bh); + } + clear_buffer_async_write(bh); + unlock_buffer(bh); + } else if (buffer_async_write(bh)) { + ASSERT(buffer_locked(bh)); + busy = true; + } + off += bh->b_size; + } while ((bh = bh->b_this_page) != head); + bit_spin_unlock(BH_Uptodate_Lock, &head->b_state); + local_irq_restore(flags); + + if (!busy) + end_page_writeback(bvec->bv_page); } /* @@ -133,8 +148,10 @@ xfs_destroy_ioend( int error) { struct inode *inode = ioend->io_inode; - struct bio *last = ioend->io_bio; - struct bio *bio, *next; + struct bio *bio = &ioend->io_inline_bio; + struct bio *last = ioend->io_bio, *next; + u64 start = bio->bi_iter.bi_sector; + bool quiet = bio_flagged(bio, BIO_QUIET); for (bio = &ioend->io_inline_bio; bio; bio = next) { struct bio_vec *bvec; @@ -155,6 +172,11 @@ xfs_destroy_ioend( bio_put(bio); } + + if (unlikely(error && !quiet)) { + xfs_err_ratelimited(XFS_I(inode)->i_mount, + "writeback error on sector %llu", start); + } } /* @@ -423,7 +445,8 @@ xfs_start_buffer_writeback( ASSERT(!buffer_delay(bh)); ASSERT(!buffer_unwritten(bh)); - mark_buffer_async_write(bh); + bh->b_end_io = NULL; + set_buffer_async_write(bh); set_buffer_uptodate(bh); clear_buffer_dirty(bh); } From 6c370590cfe0c36bcd62d548148aa65c984540b7 Mon Sep 17 00:00:00 2001 From: Pan Bian Date: Sat, 2 Sep 2017 09:54:23 -0700 Subject: [PATCH 44/44] xfs: use kmem_free to free return value of kmem_zalloc In function xfs_test_remount_options(), kfree() is used to free memory allocated by kmem_zalloc(). But it is better to use kmem_free(). Signed-off-by: Pan Bian Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 38aaacdbb8b33..c1c4c2ea1014a 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1220,7 +1220,7 @@ xfs_test_remount_options( tmp_mp->m_super = sb; error = xfs_parseargs(tmp_mp, options); xfs_free_fsname(tmp_mp); - kfree(tmp_mp); + kmem_free(tmp_mp); return error; }