Skip to content

Commit

Permalink
xfs: implement iomap based buffered write path
Browse files Browse the repository at this point in the history
Convert XFS to use the new iomap based multipage write path. This involves
implementing the ->iomap_begin and ->iomap_end methods, and switching the
buffered file write, page_mkwrite and xfs_iozero paths to the new iomap
helpers.

With this change __xfs_get_blocks will never be used for buffered writes,
and the code handling them can be removed.

Based on earlier code from Dave Chinner.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
  • Loading branch information
Christoph Hellwig authored and Dave Chinner committed Jun 20, 2016
1 parent f0c6bcb commit 68a9f5e
Show file tree
Hide file tree
Showing 7 changed files with 187 additions and 258 deletions.
1 change: 1 addition & 0 deletions fs/xfs/Kconfig
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ config XFS_FS
depends on (64BIT || LBDAF)
select EXPORTFS
select LIBCRC32C
select FS_IOMAP
help
XFS is a high performance journaling filesystem which originated
on the SGI IRIX platform. It is completely multi-threaded, can
Expand Down
212 changes: 0 additions & 212 deletions fs/xfs/xfs_aops.c
Original file line number Diff line number Diff line change
Expand Up @@ -1427,216 +1427,6 @@ xfs_vm_direct_IO(
xfs_get_blocks_direct, endio, NULL, flags);
}

/*
 * Discard the delayed-allocation blocks backing the byte range start..end.
 *
 * Both byte offsets are converted to filesystem block numbers with
 * XFS_B_TO_FSB() and the resulting block range is punched out while holding
 * XFS_ILOCK_EXCL.  There is no need to go through xfs_setattr: the page is
 * still locked at this point, so nothing can have made it to disk yet.
 *
 * A punch failure is only reported (and only if the filesystem has not
 * already been forcibly shut down); no further recovery is attempted.
 */
STATIC void
xfs_vm_kill_delalloc_range(
	struct inode		*inode,
	loff_t			start,
	loff_t			end)
{
	struct xfs_inode	*ip = XFS_I(inode);
	xfs_fileoff_t		first_fsb = XFS_B_TO_FSB(ip->i_mount, start);
	xfs_fileoff_t		last_fsb = XFS_B_TO_FSB(ip->i_mount, end);

	/* Empty (or inverted) block range: nothing to punch. */
	if (last_fsb <= first_fsb)
		return;

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	if (xfs_bmap_punch_delalloc_range(ip, first_fsb,
					  last_fsb - first_fsb) &&
	    !XFS_FORCED_SHUTDOWN(ip->i_mount)) {
		/* something screwed, just complain and carry on */
		xfs_alert(ip->i_mount,
	"xfs_vm_write_failed: unable to clean up ino %lld",
				ip->i_ino);
	}
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
}

/*
 * Clean up the buffers of @page that are covered by a failed write of @len
 * bytes at file offset @pos.
 *
 * The page's buffer_head ring is walked once. Buffers overlapping the write
 * that are delalloc or unwritten, and that are either new or start at/after
 * the current inode size (or any such buffer when xfs_mp_fail_writes() is
 * set), have their delalloc blocks punched out and all of their state bits
 * cleared so that nobody treats them as holding data.
 */
STATIC void
xfs_vm_write_failed(
struct inode *inode,
struct page *page,
loff_t pos,
unsigned len)
{
loff_t block_offset;
loff_t block_start;
loff_t block_end;
/* from/to are the write's start and end offsets relative to the page */
loff_t from = pos & (PAGE_SIZE - 1);
loff_t to = from + len;
struct buffer_head *bh, *head;
struct xfs_mount *mp = XFS_I(inode)->i_mount;

/*
* pos may be a 32-bit or a 64-bit offset; on 64-bit platforms either is
* fine. On a 32-bit platform, however, computing the start of the page
* as (pos & PAGE_MASK) would mask off the high 32 bits of a 64-bit pos,
* because PAGE_MASK is 0xfffff000 as an unsigned long. The result would
* be wrong and would trip the ASSERT below in most cases.
* To avoid that type mismatch, compute the file offset of the start of
* the page with shifts rather than with a mask.
*/
block_offset = (pos >> PAGE_SHIFT) << PAGE_SHIFT;

ASSERT(block_offset + from == pos);

head = page_buffers(page);
block_start = 0;
/*
* block_start/block_end are page-relative offsets of the current buffer;
* block_offset is its file offset. The "|| !block_start" term lets the
* first iteration run even though bh == head at that point. Note that
* the increment advances bh first, so block_offset grows by the b_size
* of the buffer just stepped to (presumably all buffers on the page
* share one size - TODO confirm).
*/
for (bh = head; bh != head || !block_start;
bh = bh->b_this_page, block_start = block_end,
block_offset += bh->b_size) {
block_end = block_start + bh->b_size;

/* skip buffers before the write */
if (block_end <= from)
continue;

/* if the buffer is after the write, we're done */
if (block_start >= to)
break;

/*
* Process delalloc and unwritten buffers beyond EOF. We can
* encounter unwritten buffers in the event that a file has
* post-EOF unwritten extents and an extending write happens to
* fail (e.g., an unaligned write that also involves a delalloc
* to the same page).
*/
if (!buffer_delay(bh) && !buffer_unwritten(bh))
continue;

/*
* Leave alone buffers that are not new and start inside EOF, unless
* xfs_mp_fail_writes() is set, in which case they are cleaned up too.
*/
if (!xfs_mp_fail_writes(mp) && !buffer_new(bh) &&
block_offset < i_size_read(inode))
continue;

/* only delalloc buffers have blocks to punch out */
if (buffer_delay(bh))
xfs_vm_kill_delalloc_range(inode, block_offset,
block_offset + bh->b_size);

/*
* This buffer does not contain data anymore. make sure anyone
* who finds it knows that for certain.
*/
clear_buffer_delay(bh);
clear_buffer_uptodate(bh);
clear_buffer_mapped(bh);
clear_buffer_new(bh);
clear_buffer_dirty(bh);
clear_buffer_unwritten(bh);
}

}

/*
 * Grab and prepare the page cache page covering the write of @len bytes at
 * @pos (at most one page).
 *
 * This used to call block_write_begin(), but that unlocks and releases the
 * page on error, and we need the page to be able to punch stale delalloc
 * blocks out on failure.  Hence the logic is open-coded here and
 * xfs_vm_write_failed() is called at the appropriate point.
 *
 * On success *pagep is set to the locked page and 0 is returned.  Returns
 * -ENOMEM if no page could be grabbed (*pagep is left untouched), otherwise
 * the error from __block_write_begin(), or -EIO when xfs_mp_fail_writes() is
 * set; in the latter two cases *pagep is set to NULL.
 *
 * File offsets are kept in loff_t throughout: size_t/ssize_t are only 32 bits
 * wide on 32-bit platforms and would truncate i_size and pos for files larger
 * than 4GiB, making the page cache truncation below hit the wrong range.
 */
STATIC int
xfs_vm_write_begin(
	struct file		*file,
	struct address_space	*mapping,
	loff_t			pos,
	unsigned		len,
	unsigned		flags,
	struct page		**pagep,
	void			**fsdata)
{
	pgoff_t			index = pos >> PAGE_SHIFT;
	struct page		*page;
	int			status;
	struct xfs_mount	*mp = XFS_I(mapping->host)->i_mount;

	ASSERT(len <= PAGE_SIZE);

	page = grab_cache_page_write_begin(mapping, index, flags);
	if (!page)
		return -ENOMEM;

	status = __block_write_begin(page, pos, len, xfs_get_blocks);
	/* write-failure injection overrides whatever the mapping returned */
	if (xfs_mp_fail_writes(mp))
		status = -EIO;
	if (unlikely(status)) {
		struct inode	*inode = mapping->host;
		loff_t		isize = i_size_read(inode);

		/* must run while the page is still locked and referenced */
		xfs_vm_write_failed(inode, page, pos, len);
		unlock_page(page);

		/*
		 * If the write is beyond EOF, we only want to kill blocks
		 * allocated in this write, not blocks that were previously
		 * written successfully.
		 */
		if (xfs_mp_fail_writes(mp))
			isize = 0;
		if (pos + len > isize) {
			loff_t	start = max_t(loff_t, pos, isize);

			truncate_pagecache_range(inode, start, pos + len);
		}

		put_page(page);
		page = NULL;
	}

	*pagep = page;
	return status;
}

/*
 * Complete a buffered write of @len bytes at @pos of which @copied bytes
 * were actually copied into @page.  Returns whatever generic_write_end()
 * returns.
 *
 * On failure, we only need to kill delalloc blocks beyond EOF in the range of
 * this specific write because they will never be written.  Previous writes
 * beyond EOF where block allocation succeeded do not need to be trashed, so
 * only new blocks from this write should be trashed.  For blocks within
 * EOF, generic_write_end() zeros them so they are safe to leave alone and be
 * written with all the other valid data.
 *
 * The inode size is held in a loff_t: a size_t is only 32 bits wide on 32-bit
 * platforms and would truncate i_size for files larger than 4GiB, so the
 * wrong range would be punched and dropped from the page cache.
 */
STATIC int
xfs_vm_write_end(
	struct file		*file,
	struct address_space	*mapping,
	loff_t			pos,
	unsigned		len,
	unsigned		copied,
	struct page		*page,
	void			*fsdata)
{
	int			ret;

	ASSERT(len <= PAGE_SIZE);

	ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
	if (unlikely(ret < len)) {
		struct inode	*inode = mapping->host;
		loff_t		isize = i_size_read(inode);
		loff_t		to = pos + len;

		if (to > isize) {
			/* only kill blocks in this write beyond EOF */
			if (pos > isize)
				isize = pos;
			xfs_vm_kill_delalloc_range(inode, isize, to);
			truncate_pagecache_range(inode, isize, to);
		}
	}
	return ret;
}

STATIC sector_t
xfs_vm_bmap(
struct address_space *mapping,
Expand Down Expand Up @@ -1747,8 +1537,6 @@ const struct address_space_operations xfs_address_space_operations = {
.set_page_dirty = xfs_vm_set_page_dirty,
.releasepage = xfs_vm_releasepage,
.invalidatepage = xfs_vm_invalidatepage,
.write_begin = xfs_vm_write_begin,
.write_end = xfs_vm_write_end,
.bmap = xfs_vm_bmap,
.direct_IO = xfs_vm_direct_IO,
.migratepage = buffer_migrate_page,
Expand Down
71 changes: 30 additions & 41 deletions fs/xfs/xfs_file.c
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
#include "xfs_log.h"
#include "xfs_icache.h"
#include "xfs_pnfs.h"
#include "xfs_iomap.h"

#include <linux/dcache.h>
#include <linux/falloc.h>
Expand Down Expand Up @@ -79,64 +80,52 @@ xfs_rw_ilock_demote(
inode_unlock(VFS_I(ip));
}

/*
* xfs_iozero clears the specified range supplied via the page cache (except in
* the DAX case). Writes through the page cache will allocate blocks over holes,
* though the callers usually map the holes first and avoid them. If a block is
* not completely zeroed, then it will be read from disk before being partially
* zeroed.
*
* In the DAX case, we can just directly write to the underlying pages. This
* will not allocate blocks, but will avoid holes and unwritten extents and so
* not do unnecessary work.
*/
int
xfs_iozero(
struct xfs_inode *ip, /* inode */
loff_t pos, /* offset in file */
size_t count) /* size of data to zero */
static int
xfs_dax_zero_range(
struct inode *inode,
loff_t pos,
size_t count)
{
struct page *page;
struct address_space *mapping;
int status = 0;


mapping = VFS_I(ip)->i_mapping;
do {
unsigned offset, bytes;
void *fsdata;

offset = (pos & (PAGE_SIZE -1)); /* Within page */
bytes = PAGE_SIZE - offset;
if (bytes > count)
bytes = count;

if (IS_DAX(VFS_I(ip))) {
status = dax_zero_page_range(VFS_I(ip), pos, bytes,
xfs_get_blocks_direct);
if (status)
break;
} else {
status = pagecache_write_begin(NULL, mapping, pos, bytes,
AOP_FLAG_UNINTERRUPTIBLE,
&page, &fsdata);
if (status)
break;

zero_user(page, offset, bytes);
status = dax_zero_page_range(inode, pos, bytes,
xfs_get_blocks_direct);
if (status)
break;

status = pagecache_write_end(NULL, mapping, pos, bytes,
bytes, page, fsdata);
WARN_ON(status <= 0); /* can't return less than zero! */
status = 0;
}
pos += bytes;
count -= bytes;
} while (count);

return status;
}

/*
 * Zero @count bytes of the file starting at offset @pos.
 *
 * DAX inodes are handled by xfs_dax_zero_range(); all other inodes are zeroed
 * through iomap_zero_range() using xfs_iomap_ops.  Holes and unwritten extents
 * are left as-is, as they already are zeroed.
 */
int
xfs_iozero(
	struct xfs_inode	*ip,
	loff_t			pos,
	size_t			count)
{
	struct inode		*inode = VFS_I(ip);

	if (!IS_DAX(inode))
		return iomap_zero_range(inode, pos, count, NULL,
					&xfs_iomap_ops);

	return xfs_dax_zero_range(inode, pos, count);
}

int
xfs_update_prealloc_flags(
struct xfs_inode *ip,
Expand Down Expand Up @@ -841,7 +830,7 @@ xfs_file_buffered_aio_write(
write_retry:
trace_xfs_file_buffered_write(ip, iov_iter_count(from),
iocb->ki_pos, 0);
ret = generic_perform_write(file, from, iocb->ki_pos);
ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops);
if (likely(ret >= 0))
iocb->ki_pos += ret;

Expand Down Expand Up @@ -1553,7 +1542,7 @@ xfs_filemap_page_mkwrite(
if (IS_DAX(inode)) {
ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault);
} else {
ret = block_page_mkwrite(vma, vmf, xfs_get_blocks);
ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops);
ret = block_page_mkwrite_return(ret);
}

Expand Down
Loading

0 comments on commit 68a9f5e

Please sign in to comment.