Skip to content

Commit

Permalink
Merge branch 'xfs-dax-support' into for-next
Browse files Browse the repository at this point in the history
  • Loading branch information
Dave Chinner committed Jun 4, 2015
2 parents b9a350a + cbe4dab commit 66e8ac7
Show file tree
Hide file tree
Showing 12 changed files with 332 additions and 153 deletions.
34 changes: 27 additions & 7 deletions fs/dax.c
Original file line number Diff line number Diff line change
Expand Up @@ -309,14 +309,21 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
out:
i_mmap_unlock_read(mapping);

if (bh->b_end_io)
bh->b_end_io(bh, 1);

return error;
}

static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
get_block_t get_block)
/**
* __dax_fault - handle a page fault on a DAX file
* @vma: The virtual memory area where the fault occurred
* @vmf: The description of the fault
* @get_block: The filesystem method used to translate file offsets to blocks
*
* When a page fault occurs, filesystems may call this helper in their
* fault handler for DAX files. __dax_fault() assumes the caller has done all
* the necessary locking for the page fault to proceed successfully.
*/
int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
get_block_t get_block, dax_iodone_t complete_unwritten)
{
struct file *file = vma->vm_file;
struct address_space *mapping = file->f_mapping;
Expand Down Expand Up @@ -417,7 +424,19 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
page_cache_release(page);
}

/*
* If we successfully insert the new mapping over an unwritten extent,
* we need to ensure we convert the unwritten extent. If there is an
* error inserting the mapping, the filesystem needs to leave it as
* unwritten to prevent exposure of the stale underlying data to
* userspace, but we still need to call the completion function so
* the private resources on the mapping buffer can be released. We
* indicate what the callback should do via the uptodate variable, same
* as for normal BH based IO completions.
*/
error = dax_insert_mapping(inode, &bh, vma, vmf);
if (buffer_unwritten(&bh))
complete_unwritten(&bh, !error);

out:
if (error == -ENOMEM)
Expand All @@ -434,6 +453,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
}
goto out;
}
EXPORT_SYMBOL(__dax_fault);

/**
* dax_fault - handle a page fault on a DAX file
Expand All @@ -445,7 +465,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
* fault handler for DAX files.
*/
int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
get_block_t get_block)
get_block_t get_block, dax_iodone_t complete_unwritten)
{
int result;
struct super_block *sb = file_inode(vma->vm_file)->i_sb;
Expand All @@ -454,7 +474,7 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
sb_start_pagefault(sb);
file_update_time(vma->vm_file);
}
result = do_dax_fault(vma, vmf, get_block);
result = __dax_fault(vma, vmf, get_block, complete_unwritten);
if (vmf->flags & FAULT_FLAG_WRITE)
sb_end_pagefault(sb);

Expand Down
4 changes: 2 additions & 2 deletions fs/ext2/file.c
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,12 @@
#ifdef CONFIG_FS_DAX
static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
return dax_fault(vma, vmf, ext2_get_block);
return dax_fault(vma, vmf, ext2_get_block, NULL);
}

static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
return dax_mkwrite(vma, vmf, ext2_get_block);
return dax_mkwrite(vma, vmf, ext2_get_block, NULL);
}

static const struct vm_operations_struct ext2_dax_vm_ops = {
Expand Down
16 changes: 14 additions & 2 deletions fs/ext4/file.c
Original file line number Diff line number Diff line change
Expand Up @@ -192,15 +192,27 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
}

#ifdef CONFIG_FS_DAX
/*
 * Completion callback for a DAX mapping set up over an unwritten extent.
 *
 * @bh:       mapping buffer. b_assoc_map identifies the inode, b_private
 *            carries the file block index and b_size the byte length of the
 *            range to convert.
 * @uptodate: zero when the caller reports failure; the extent is then left
 *            unwritten and nothing is converted.
 *
 * The block index is recovered from the pointer-sized b_private field, so on
 * a 32-bit kernel an index that does not fit in a pointer has already been
 * truncated by the time it is scaled to a byte offset here.
 */
static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
{
	struct inode *inode = bh->b_assoc_map->host;
	/* XXX: breaks on 32-bit > 16GB. Is that even supported? */
	loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
	int err;

	if (!uptodate)
		return;

	WARN_ON(!buffer_unwritten(bh));
	err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
	/*
	 * This callback returns void, so a conversion failure cannot be
	 * propagated to the caller.  Do not drop it silently: the range
	 * stays unwritten, so make the failure visible.
	 */
	WARN_ON(err);
}

static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
return dax_fault(vma, vmf, ext4_get_block);
return dax_fault(vma, vmf, ext4_get_block, ext4_end_io_unwritten);
/* Is this the right get_block? */
}

static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
return dax_mkwrite(vma, vmf, ext4_get_block);
return dax_mkwrite(vma, vmf, ext4_get_block, ext4_end_io_unwritten);
}

static const struct vm_operations_struct ext4_dax_vm_ops = {
Expand Down
21 changes: 7 additions & 14 deletions fs/ext4/inode.c
Original file line number Diff line number Diff line change
Expand Up @@ -656,18 +656,6 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
return retval;
}

/*
 * Completion callback for a DAX mapping set up over an unwritten extent.
 *
 * @bh:       mapping buffer. b_assoc_map identifies the inode, b_private
 *            carries the file block index and b_size the byte length of the
 *            range to convert.
 * @uptodate: zero when the caller reports failure; the extent is then left
 *            unwritten and nothing is converted.
 *
 * The block index is recovered from the pointer-sized b_private field, so on
 * a 32-bit kernel an index that does not fit in a pointer has already been
 * truncated by the time it is scaled to a byte offset here.
 */
static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
{
	struct inode *inode = bh->b_assoc_map->host;
	/* XXX: breaks on 32-bit > 16GB. Is that even supported? */
	loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
	int err;

	if (!uptodate)
		return;

	WARN_ON(!buffer_unwritten(bh));
	err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
	/*
	 * This callback returns void, so a conversion failure cannot be
	 * propagated to the caller.  Do not drop it silently: the range
	 * stays unwritten, so make the failure visible.
	 */
	WARN_ON(err);
}

/* Maximum number of blocks we map for direct IO at once. */
#define DIO_MAX_BLOCKS 4096

Expand Down Expand Up @@ -705,10 +693,15 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,

map_bh(bh, inode->i_sb, map.m_pblk);
bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) {
if (IS_DAX(inode) && buffer_unwritten(bh)) {
/*
* dgc: I suspect unwritten conversion on ext4+DAX is
* fundamentally broken here when there are concurrent
* read/write in progress on this inode.
*/
WARN_ON_ONCE(io_end);
bh->b_assoc_map = inode->i_mapping;
bh->b_private = (void *)(unsigned long)iblock;
bh->b_end_io = ext4_end_io_unwritten;
}
if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
set_buffer_defer_completion(bh);
Expand Down
152 changes: 110 additions & 42 deletions fs/xfs/xfs_aops.c
Original file line number Diff line number Diff line change
Expand Up @@ -1349,7 +1349,7 @@ __xfs_get_blocks(
sector_t iblock,
struct buffer_head *bh_result,
int create,
int direct)
bool direct)
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
Expand Down Expand Up @@ -1414,6 +1414,7 @@ __xfs_get_blocks(
if (error)
return error;
new = 1;

} else {
/*
* Delalloc reservations do not require a transaction,
Expand Down Expand Up @@ -1508,49 +1509,29 @@ xfs_get_blocks(
struct buffer_head *bh_result,
int create)
{
return __xfs_get_blocks(inode, iblock, bh_result, create, 0);
return __xfs_get_blocks(inode, iblock, bh_result, create, false);
}

STATIC int
int
xfs_get_blocks_direct(
struct inode *inode,
sector_t iblock,
struct buffer_head *bh_result,
int create)
{
return __xfs_get_blocks(inode, iblock, bh_result, create, 1);
return __xfs_get_blocks(inode, iblock, bh_result, create, true);
}

/*
* Complete a direct I/O write request.
*
* The ioend structure is passed from __xfs_get_blocks() to tell us what to do.
* If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite
* wholly within the EOF and so there is nothing for us to do. Note that in this
* case the completion can be called in interrupt context, whereas if we have an
* ioend we will always be called in task context (i.e. from a workqueue).
*/
STATIC void
xfs_end_io_direct_write(
struct kiocb *iocb,
static void
__xfs_end_io_direct_write(
struct inode *inode,
struct xfs_ioend *ioend,
loff_t offset,
ssize_t size,
void *private)
ssize_t size)
{
struct inode *inode = file_inode(iocb->ki_filp);
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
struct xfs_ioend *ioend = private;

trace_xfs_gbmap_direct_endio(ip, offset, size,
ioend ? ioend->io_type : 0, NULL);
struct xfs_mount *mp = XFS_I(inode)->i_mount;

if (!ioend) {
ASSERT(offset + size <= i_size_read(inode));
return;
}

if (XFS_FORCED_SHUTDOWN(mp))
if (XFS_FORCED_SHUTDOWN(mp) || ioend->io_error)
goto out_end_io;

/*
Expand Down Expand Up @@ -1587,10 +1568,10 @@ xfs_end_io_direct_write(
* here can result in EOF moving backwards and Bad Things Happen when
* that occurs.
*/
spin_lock(&ip->i_flags_lock);
spin_lock(&XFS_I(inode)->i_flags_lock);
if (offset + size > i_size_read(inode))
i_size_write(inode, offset + size);
spin_unlock(&ip->i_flags_lock);
spin_unlock(&XFS_I(inode)->i_flags_lock);

/*
* If we are doing an append IO that needs to update the EOF on disk,
Expand All @@ -1607,23 +1588,110 @@ xfs_end_io_direct_write(
return;
}

/*
 * Direct I/O write completion handler.
 *
 * @private is the ioend that __xfs_get_blocks() attached to the request, if
 * it attached one.  With no ioend the write was an overwrite lying entirely
 * inside EOF, so there is nothing left to do once the completion has been
 * traced; that case may run in interrupt context.  When an ioend is present
 * we are always called in task context (i.e. from a workqueue) and the real
 * work is handed to __xfs_end_io_direct_write().
 */
STATIC void
xfs_end_io_direct_write(
	struct kiocb		*iocb,
	loff_t			offset,
	ssize_t			size,
	void			*private)
{
	struct xfs_ioend	*ioend = private;
	struct inode		*inode = file_inode(iocb->ki_filp);

	trace_xfs_gbmap_direct_endio(XFS_I(inode), offset, size,
				     ioend ? ioend->io_type : 0, NULL);

	if (ioend) {
		__xfs_end_io_direct_write(inode, ioend, offset, size);
		return;
	}

	/* No ioend: a plain overwrite, which must not reach past EOF. */
	ASSERT(offset + size <= i_size_read(inode));
}

/*
 * Mapping buffer completion callback for DAX.
 *
 * A DAX page fault can allocate blocks and zero them, after which the
 * unwritten extent has to be converted.  The range described by the ioend
 * may reach past EOF; extending EOF from here would be wrong, so the size
 * handed on for conversion is clamped to the current inode size.
 *
 * @bh:       mapping buffer whose b_private points at the ioend.
 * @uptodate: zero if zeroing failed; the ioend is then marked with -EIO so
 *            the extent is not converted.
 */
#ifdef CONFIG_FS_DAX
void
xfs_end_io_dax_write(
	struct buffer_head	*bh,
	int			uptodate)
{
	struct xfs_ioend	*ioend = bh->b_private;
	struct inode		*inode = ioend->io_inode;
	struct xfs_inode	*ip = XFS_I(inode);
	ssize_t			count = ioend->io_size;

	ASSERT(IS_DAX(inode));

	/* A failed zeroing must leave the extent unwritten. */
	if (!uptodate)
		ioend->io_error = -EIO;

	/*
	 * Clamp the conversion length to EOF so that converting a partial
	 * EOF block never moves EOF outwards.
	 */
	spin_lock(&ip->i_flags_lock);
	if (ioend->io_offset + count > i_size_read(inode))
		count = i_size_read(inode) - ioend->io_offset;
	spin_unlock(&ip->i_flags_lock);

	__xfs_end_io_direct_write(inode, ioend, ioend->io_offset, count);
}
#else
void xfs_end_io_dax_write(struct buffer_head *bh, int uptodate) { }
#endif

/*
 * Dispatch a direct I/O request to the backend that matches the inode.
 *
 * DAX inodes go through dax_do_io(); everything else goes through
 * __blockdev_direct_IO() on the block device backing the inode.  Both paths
 * map blocks with xfs_get_blocks_direct() and use @endio as the completion
 * handler.  @flags is only forwarded on the block device path; the DAX path
 * always passes 0.
 */
static inline ssize_t
xfs_vm_do_dio(
	struct inode		*inode,
	struct kiocb		*iocb,
	struct iov_iter		*iter,
	loff_t			offset,
	void			(*endio)(struct kiocb	*iocb,
					 loff_t		offset,
					 ssize_t	size,
					 void		*private),
	int			flags)
{
	if (!IS_DAX(inode)) {
		struct block_device	*bdev = xfs_find_bdev_for_inode(inode);

		return __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
				xfs_get_blocks_direct, endio, NULL, flags);
	}

	return dax_do_io(iocb, inode, iter, offset,
			 xfs_get_blocks_direct, endio, 0);
}

STATIC ssize_t
xfs_vm_direct_IO(
struct kiocb *iocb,
struct iov_iter *iter,
loff_t offset)
{
struct inode *inode = iocb->ki_filp->f_mapping->host;
struct block_device *bdev = xfs_find_bdev_for_inode(inode);

if (iov_iter_rw(iter) == WRITE) {
return __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
xfs_get_blocks_direct,
xfs_end_io_direct_write, NULL,
DIO_ASYNC_EXTEND);
}
return __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
xfs_get_blocks_direct, NULL, NULL, 0);
if (iov_iter_rw(iter) == WRITE)
return xfs_vm_do_dio(inode, iocb, iter, offset,
xfs_end_io_direct_write, DIO_ASYNC_EXTEND);
return xfs_vm_do_dio(inode, iocb, iter, offset, NULL, 0);
}

/*
Expand Down
7 changes: 6 additions & 1 deletion fs/xfs/xfs_aops.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,12 @@ typedef struct xfs_ioend {
} xfs_ioend_t;

extern const struct address_space_operations xfs_address_space_operations;
extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int);

int xfs_get_blocks(struct inode *inode, sector_t offset,
struct buffer_head *map_bh, int create);
int xfs_get_blocks_direct(struct inode *inode, sector_t offset,
struct buffer_head *map_bh, int create);
void xfs_end_io_dax_write(struct buffer_head *bh, int uptodate);

extern void xfs_count_page_state(struct page *, int *, int *);

Expand Down
Loading

0 comments on commit 66e8ac7

Please sign in to comment.