Skip to content

Commit

Permalink
xfs: on-stack delayed write buffer lists
Browse files Browse the repository at this point in the history
Queue delwri buffers on a local on-stack list instead of a per-buftarg one,
and write back the buffers per-process instead of by waking up xfsbufd.

This is now easily doable given that we have very few places left that write
delwri buffers:

 - log recovery:
	Only done at mount time, and already forcing out the buffers
	synchronously using xfs_flush_buftarg

 - quotacheck:
	Same story.

 - dquot reclaim:
	Writes out dirty dquots on the LRU under memory pressure.  We might
	want to look into doing more of this via xfsaild, but it's already
	more optimal than the synchronous inode reclaim that writes each
	buffer synchronously.

 - xfsaild:
	This is the main beneficiary of the change.  By keeping a local list
	of buffers to write we reduce latency of writing out buffers, and
	more importably we can remove all the delwri list promotions which
	were hitting the buffer cache hard under sustained metadata loads.

The implementation is very straight forward - xfs_buf_delwri_queue now gets
a new list_head pointer that it adds the delwri buffers to, and all callers
need to eventually submit the list using xfs_buf_delwi_submit or
xfs_buf_delwi_submit_nowait.  Buffers that already are on a delwri list are
skipped in xfs_buf_delwri_queue, assuming they already are on another delwri
list.  The biggest change to pass down the buffer list was done to the AIL
pushing. Now that we operate on buffers the trylock, push and pushbuf log
item methods are merged into a single push routine, which tries to lock the
item, and if possible add the buffer that needs writeback to the buffer list.
This leads to much simpler code than the previous split but requires the
individual IOP_PUSH instances to unlock and reacquire the AIL around calls
to blocking routines.

Given that xfsailds now also handle writing out buffers, the conditions for
log forcing and the sleep times needed some small changes.  The most
important one is that we consider an AIL busy as long we still have buffers
to push, and the other one is that we do increment the pushed LSN for
buffers that are under flushing at this moment, but still count them towards
the stuck items for restart purposes.  Without this we could hammer on stuck
items without ever forcing the log and not make progress under heavy random
delete workloads on fast flash storage devices.

[ Dave Chinner:
	- rebase on previous patches.
	- improved comments for XBF_DELWRI_Q handling
	- fix XBF_ASYNC handling in queue submission (test 106 failure)
	- rename delwri submit function buffer list parameters for clarity
	- xfs_efd_item_push() should return XFS_ITEM_PINNED ]

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
  • Loading branch information
Christoph Hellwig authored and Ben Myers committed May 14, 2012
1 parent 960c60a commit 43ff212
Show file tree
Hide file tree
Showing 19 changed files with 442 additions and 918 deletions.
341 changes: 131 additions & 210 deletions fs/xfs/xfs_buf.c

Large diffs are not rendered by default.

28 changes: 6 additions & 22 deletions fs/xfs/xfs_buf.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,7 @@ typedef enum {
#define XBF_MAPPED (1 << 3) /* buffer mapped (b_addr valid) */
#define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */
#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */
#define XBF_DELWRI (1 << 6) /* buffer has dirty pages */
#define XBF_STALE (1 << 7) /* buffer has been staled, do not find it */
#define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */

/* I/O hints for the BIO layer */
#define XBF_SYNCIO (1 << 10)/* treat this buffer as synchronous I/O */
Expand All @@ -65,7 +64,7 @@ typedef enum {
/* flags used only internally */
#define _XBF_PAGES (1 << 20)/* backed by refcounted pages */
#define _XBF_KMEM (1 << 21)/* backed by heap memory */
#define _XBF_DELWRI_Q (1 << 22)/* buffer on delwri queue */
#define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */

typedef unsigned int xfs_buf_flags_t;

Expand All @@ -76,7 +75,6 @@ typedef unsigned int xfs_buf_flags_t;
{ XBF_MAPPED, "MAPPED" }, \
{ XBF_ASYNC, "ASYNC" }, \
{ XBF_DONE, "DONE" }, \
{ XBF_DELWRI, "DELWRI" }, \
{ XBF_STALE, "STALE" }, \
{ XBF_SYNCIO, "SYNCIO" }, \
{ XBF_FUA, "FUA" }, \
Expand All @@ -88,10 +86,6 @@ typedef unsigned int xfs_buf_flags_t;
{ _XBF_KMEM, "KMEM" }, \
{ _XBF_DELWRI_Q, "DELWRI_Q" }

typedef enum {
XBT_FORCE_FLUSH = 0,
} xfs_buftarg_flags_t;

typedef struct xfs_buftarg {
dev_t bt_dev;
struct block_device *bt_bdev;
Expand All @@ -101,12 +95,6 @@ typedef struct xfs_buftarg {
unsigned int bt_sshift;
size_t bt_smask;

/* per device delwri queue */
struct task_struct *bt_task;
struct list_head bt_delwri_queue;
spinlock_t bt_delwri_lock;
unsigned long bt_flags;

/* LRU control structures */
struct shrinker bt_shrinker;
struct list_head bt_lru;
Expand Down Expand Up @@ -150,7 +138,6 @@ typedef struct xfs_buf {
struct xfs_trans *b_transp;
struct page **b_pages; /* array of page pointers */
struct page *b_page_array[XB_PAGES]; /* inline pages */
unsigned long b_queuetime; /* time buffer was queued */
atomic_t b_pin_count; /* pin count */
atomic_t b_io_remaining; /* #outstanding I/O requests */
unsigned int b_page_count; /* size of page array */
Expand Down Expand Up @@ -220,24 +207,22 @@ static inline int xfs_buf_geterror(xfs_buf_t *bp)
extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t);

/* Delayed Write Buffer Routines */
extern void xfs_buf_delwri_queue(struct xfs_buf *);
extern void xfs_buf_delwri_dequeue(struct xfs_buf *);
extern void xfs_buf_delwri_promote(struct xfs_buf *);
extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *);
extern int xfs_buf_delwri_submit(struct list_head *);
extern int xfs_buf_delwri_submit_nowait(struct list_head *);

/* Buffer Daemon Setup Routines */
extern int xfs_buf_init(void);
extern void xfs_buf_terminate(void);

#define XFS_BUF_ZEROFLAGS(bp) \
((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI| \
((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC| \
XBF_SYNCIO|XBF_FUA|XBF_FLUSH))

void xfs_buf_stale(struct xfs_buf *bp);
#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE)
#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE)

#define XFS_BUF_ISDELAYWRITE(bp) ((bp)->b_flags & XBF_DELWRI)

#define XFS_BUF_DONE(bp) ((bp)->b_flags |= XBF_DONE)
#define XFS_BUF_UNDONE(bp) ((bp)->b_flags &= ~XBF_DONE)
#define XFS_BUF_ISDONE(bp) ((bp)->b_flags & XBF_DONE)
Expand Down Expand Up @@ -287,7 +272,6 @@ extern xfs_buftarg_t *xfs_alloc_buftarg(struct xfs_mount *,
extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *);
extern void xfs_wait_buftarg(xfs_buftarg_t *);
extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
extern int xfs_flush_buftarg(xfs_buftarg_t *, int);

#define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev)
#define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev)
Expand Down
96 changes: 26 additions & 70 deletions fs/xfs/xfs_buf_item.c
Original file line number Diff line number Diff line change
Expand Up @@ -418,7 +418,6 @@ xfs_buf_item_unpin(
if (freed && stale) {
ASSERT(bip->bli_flags & XFS_BLI_STALE);
ASSERT(xfs_buf_islocked(bp));
ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
ASSERT(XFS_BUF_ISSTALE(bp));
ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);

Expand Down Expand Up @@ -469,34 +468,28 @@ xfs_buf_item_unpin(
}
}

/*
* This is called to attempt to lock the buffer associated with this
* buf log item. Don't sleep on the buffer lock. If we can't get
* the lock right away, return 0. If we can get the lock, take a
* reference to the buffer. If this is a delayed write buffer that
* needs AIL help to be written back, invoke the pushbuf routine
* rather than the normal success path.
*/
STATIC uint
xfs_buf_item_trylock(
struct xfs_log_item *lip)
xfs_buf_item_push(
struct xfs_log_item *lip,
struct list_head *buffer_list)
{
struct xfs_buf_log_item *bip = BUF_ITEM(lip);
struct xfs_buf *bp = bip->bli_buf;
uint rval = XFS_ITEM_SUCCESS;

if (xfs_buf_ispinned(bp))
return XFS_ITEM_PINNED;
if (!xfs_buf_trylock(bp))
return XFS_ITEM_LOCKED;

/* take a reference to the buffer. */
xfs_buf_hold(bp);

ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
trace_xfs_buf_item_trylock(bip);
if (XFS_BUF_ISDELAYWRITE(bp))
return XFS_ITEM_PUSHBUF;
return XFS_ITEM_SUCCESS;

trace_xfs_buf_item_push(bip);

if (!xfs_buf_delwri_queue(bp, buffer_list))
rval = XFS_ITEM_FLUSHING;
xfs_buf_unlock(bp);
return rval;
}

/*
Expand Down Expand Up @@ -609,48 +602,6 @@ xfs_buf_item_committed(
return lsn;
}

/*
* The buffer is locked, but is not a delayed write buffer.
*/
STATIC void
xfs_buf_item_push(
struct xfs_log_item *lip)
{
struct xfs_buf_log_item *bip = BUF_ITEM(lip);
struct xfs_buf *bp = bip->bli_buf;

ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
ASSERT(!XFS_BUF_ISDELAYWRITE(bp));

trace_xfs_buf_item_push(bip);

xfs_buf_delwri_queue(bp);
xfs_buf_relse(bp);
}

/*
* The buffer is locked and is a delayed write buffer. Promote the buffer
* in the delayed write queue as the caller knows that they must invoke
* the xfsbufd to get this buffer written. We have to unlock the buffer
* to allow the xfsbufd to write it, too.
*/
STATIC bool
xfs_buf_item_pushbuf(
struct xfs_log_item *lip)
{
struct xfs_buf_log_item *bip = BUF_ITEM(lip);
struct xfs_buf *bp = bip->bli_buf;

ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
ASSERT(XFS_BUF_ISDELAYWRITE(bp));

trace_xfs_buf_item_pushbuf(bip);

xfs_buf_delwri_promote(bp);
xfs_buf_relse(bp);
return true;
}

STATIC void
xfs_buf_item_committing(
struct xfs_log_item *lip,
Expand All @@ -666,11 +617,9 @@ static const struct xfs_item_ops xfs_buf_item_ops = {
.iop_format = xfs_buf_item_format,
.iop_pin = xfs_buf_item_pin,
.iop_unpin = xfs_buf_item_unpin,
.iop_trylock = xfs_buf_item_trylock,
.iop_unlock = xfs_buf_item_unlock,
.iop_committed = xfs_buf_item_committed,
.iop_push = xfs_buf_item_push,
.iop_pushbuf = xfs_buf_item_pushbuf,
.iop_committing = xfs_buf_item_committing
};

Expand Down Expand Up @@ -989,20 +938,27 @@ xfs_buf_iodone_callbacks(
* If the write was asynchronous then no one will be looking for the
* error. Clear the error state and write the buffer out again.
*
* During sync or umount we'll write all pending buffers again
* synchronous, which will catch these errors if they keep hanging
* around.
* XXX: This helps against transient write errors, but we need to find
* a way to shut the filesystem down if the writes keep failing.
*
* In practice we'll shut the filesystem down soon as non-transient
* erorrs tend to affect the whole device and a failing log write
* will make us give up. But we really ought to do better here.
*/
if (XFS_BUF_ISASYNC(bp)) {
ASSERT(bp->b_iodone != NULL);

trace_xfs_buf_item_iodone_async(bp, _RET_IP_);

xfs_buf_ioerror(bp, 0); /* errno of 0 unsets the flag */

if (!XFS_BUF_ISSTALE(bp)) {
xfs_buf_delwri_queue(bp);
XFS_BUF_DONE(bp);
bp->b_flags |= XBF_WRITE | XBF_ASYNC | XBF_DONE;
xfs_bdstrat_cb(bp);
} else {
xfs_buf_relse(bp);
}
ASSERT(bp->b_iodone != NULL);
trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
xfs_buf_relse(bp);

return;
}

Expand Down
33 changes: 0 additions & 33 deletions fs/xfs/xfs_dquot.c
Original file line number Diff line number Diff line change
Expand Up @@ -1005,39 +1005,6 @@ xfs_dqlock2(
}
}

/*
* Give the buffer a little push if it is incore and
* wait on the flush lock.
*/
void
xfs_dqflock_pushbuf_wait(
xfs_dquot_t *dqp)
{
xfs_mount_t *mp = dqp->q_mount;
xfs_buf_t *bp;

/*
* Check to see if the dquot has been flushed delayed
* write. If so, grab its buffer and send it
* out immediately. We'll be able to acquire
* the flush lock when the I/O completes.
*/
bp = xfs_incore(mp->m_ddev_targp, dqp->q_blkno,
mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK);
if (!bp)
goto out_lock;

if (XFS_BUF_ISDELAYWRITE(bp)) {
if (xfs_buf_ispinned(bp))
xfs_log_force(mp, 0);
xfs_buf_delwri_promote(bp);
wake_up_process(bp->b_target->bt_task);
}
xfs_buf_relse(bp);
out_lock:
xfs_dqflock(dqp);
}

int __init
xfs_qm_init(void)
{
Expand Down
1 change: 0 additions & 1 deletion fs/xfs/xfs_dquot.h
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,6 @@ extern int xfs_qm_dqget(xfs_mount_t *, xfs_inode_t *,
extern void xfs_qm_dqput(xfs_dquot_t *);

extern void xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *);
extern void xfs_dqflock_pushbuf_wait(struct xfs_dquot *dqp);

static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp)
{
Expand Down
Loading

0 comments on commit 43ff212

Please sign in to comment.