Skip to content

Commit

Permalink
[PATCH] Add vector AIO support
Browse files Browse the repository at this point in the history
This work is initially done by Zach Brown to add support for vectored aio.
These are the core changes for AIO to support
IOCB_CMD_PREADV/IOCB_CMD_PWRITEV.

[akpm@osdl.org: huge build fix]
Signed-off-by: Zach Brown <zach.brown@oracle.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Badari Pulavarty <pbadari@us.ibm.com>
Acked-by: Benjamin LaHaise <bcrl@kvack.org>
Acked-by: James Morris <jmorris@namei.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
  • Loading branch information
Badari Pulavarty authored and Linus Torvalds committed Oct 1, 2006
1 parent 543ade1 commit eed4e51
Show file tree
Hide file tree
Showing 5 changed files with 203 additions and 108 deletions.
171 changes: 117 additions & 54 deletions fs/aio.c
Original file line number Diff line number Diff line change
Expand Up @@ -415,6 +415,7 @@ static struct kiocb fastcall *__aio_get_req(struct kioctx *ctx)
req->ki_retry = NULL;
req->ki_dtor = NULL;
req->private = NULL;
req->ki_iovec = NULL;
INIT_LIST_HEAD(&req->ki_run_list);

/* Check if the completion queue has enough free space to
Expand Down Expand Up @@ -460,6 +461,8 @@ static inline void really_put_req(struct kioctx *ctx, struct kiocb *req)

if (req->ki_dtor)
req->ki_dtor(req);
if (req->ki_iovec != &req->ki_inline_vec)
kfree(req->ki_iovec);
kmem_cache_free(kiocb_cachep, req);
ctx->reqs_active--;

Expand Down Expand Up @@ -1301,69 +1304,63 @@ asmlinkage long sys_io_destroy(aio_context_t ctx)
return -EINVAL;
}

/*
* aio_p{read,write} are the default ki_retry methods for
* IO_CMD_P{READ,WRITE}. They maintains kiocb retry state around potentially
* multiple calls to f_op->aio_read(). They loop around partial progress
* instead of returning -EIOCBRETRY because they don't have the means to call
* kick_iocb().
*/
static ssize_t aio_pread(struct kiocb *iocb)
static void aio_advance_iovec(struct kiocb *iocb, ssize_t ret)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
ssize_t ret = 0;

do {
iocb->ki_inline_vec.iov_base = iocb->ki_buf;
iocb->ki_inline_vec.iov_len = iocb->ki_left;

ret = file->f_op->aio_read(iocb, &iocb->ki_inline_vec,
1, iocb->ki_pos);
/*
* Can't just depend on iocb->ki_left to determine
* whether we are done. This may have been a short read.
*/
if (ret > 0) {
iocb->ki_buf += ret;
iocb->ki_left -= ret;
struct iovec *iov = &iocb->ki_iovec[iocb->ki_cur_seg];

BUG_ON(ret <= 0);

while (iocb->ki_cur_seg < iocb->ki_nr_segs && ret > 0) {
ssize_t this = min((ssize_t)iov->iov_len, ret);
iov->iov_base += this;
iov->iov_len -= this;
iocb->ki_left -= this;
ret -= this;
if (iov->iov_len == 0) {
iocb->ki_cur_seg++;
iov++;
}
}

/*
* For pipes and sockets we return once we have some data; for
* regular files we retry till we complete the entire read or
* find that we can't read any more data (e.g short reads).
*/
} while (ret > 0 && iocb->ki_left > 0 &&
!S_ISFIFO(inode->i_mode) && !S_ISSOCK(inode->i_mode));

/* This means we must have transferred all that we could */
/* No need to retry anymore */
if ((ret == 0) || (iocb->ki_left == 0))
ret = iocb->ki_nbytes - iocb->ki_left;

return ret;
/* the caller should not have done more io than what fit in
* the remaining iovecs */
BUG_ON(ret > 0 && iocb->ki_left == 0);
}

/* see aio_pread() */
static ssize_t aio_pwrite(struct kiocb *iocb)
static ssize_t aio_rw_vect_retry(struct kiocb *iocb)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
ssize_t (*rw_op)(struct kiocb *, const struct iovec *,
unsigned long, loff_t);
ssize_t ret = 0;
unsigned short opcode;

if ((iocb->ki_opcode == IOCB_CMD_PREADV) ||
(iocb->ki_opcode == IOCB_CMD_PREAD)) {
rw_op = file->f_op->aio_read;
opcode = IOCB_CMD_PREADV;
} else {
rw_op = file->f_op->aio_write;
opcode = IOCB_CMD_PWRITEV;
}

do {
iocb->ki_inline_vec.iov_base = iocb->ki_buf;
iocb->ki_inline_vec.iov_len = iocb->ki_left;

ret = file->f_op->aio_write(iocb, &iocb->ki_inline_vec,
1, iocb->ki_pos);
if (ret > 0) {
iocb->ki_buf += ret;
iocb->ki_left -= ret;
}
} while (ret > 0 && iocb->ki_left > 0);
ret = rw_op(iocb, &iocb->ki_iovec[iocb->ki_cur_seg],
iocb->ki_nr_segs - iocb->ki_cur_seg,
iocb->ki_pos);
if (ret > 0)
aio_advance_iovec(iocb, ret);

/* retry all partial writes. retry partial reads as long as its a
* regular file. */
} while (ret > 0 && iocb->ki_left > 0 &&
(opcode == IOCB_CMD_PWRITEV ||
(!S_ISFIFO(inode->i_mode) && !S_ISSOCK(inode->i_mode))));

/* This means we must have transferred all that we could */
/* No need to retry anymore */
if ((ret == 0) || (iocb->ki_left == 0))
ret = iocb->ki_nbytes - iocb->ki_left;

Expand All @@ -1390,6 +1387,38 @@ static ssize_t aio_fsync(struct kiocb *iocb)
return ret;
}

static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb)
{
ssize_t ret;

ret = rw_copy_check_uvector(type, (struct iovec __user *)kiocb->ki_buf,
kiocb->ki_nbytes, 1,
&kiocb->ki_inline_vec, &kiocb->ki_iovec);
if (ret < 0)
goto out;

kiocb->ki_nr_segs = kiocb->ki_nbytes;
kiocb->ki_cur_seg = 0;
/* ki_nbytes/left now reflect bytes instead of segs */
kiocb->ki_nbytes = ret;
kiocb->ki_left = ret;

ret = 0;
out:
return ret;
}

static ssize_t aio_setup_single_vector(struct kiocb *kiocb)
{
kiocb->ki_iovec = &kiocb->ki_inline_vec;
kiocb->ki_iovec->iov_base = kiocb->ki_buf;
kiocb->ki_iovec->iov_len = kiocb->ki_left;
kiocb->ki_nr_segs = 1;
kiocb->ki_cur_seg = 0;
kiocb->ki_nbytes = kiocb->ki_left;
return 0;
}

/*
* aio_setup_iocb:
* Performs the initial checks and aio retry method
Expand All @@ -1412,9 +1441,12 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb)
ret = security_file_permission(file, MAY_READ);
if (unlikely(ret))
break;
ret = aio_setup_single_vector(kiocb);
if (ret)
break;
ret = -EINVAL;
if (file->f_op->aio_read)
kiocb->ki_retry = aio_pread;
kiocb->ki_retry = aio_rw_vect_retry;
break;
case IOCB_CMD_PWRITE:
ret = -EBADF;
Expand All @@ -1427,9 +1459,40 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb)
ret = security_file_permission(file, MAY_WRITE);
if (unlikely(ret))
break;
ret = aio_setup_single_vector(kiocb);
if (ret)
break;
ret = -EINVAL;
if (file->f_op->aio_write)
kiocb->ki_retry = aio_rw_vect_retry;
break;
case IOCB_CMD_PREADV:
ret = -EBADF;
if (unlikely(!(file->f_mode & FMODE_READ)))
break;
ret = security_file_permission(file, MAY_READ);
if (unlikely(ret))
break;
ret = aio_setup_vectored_rw(READ, kiocb);
if (ret)
break;
ret = -EINVAL;
if (file->f_op->aio_read)
kiocb->ki_retry = aio_rw_vect_retry;
break;
case IOCB_CMD_PWRITEV:
ret = -EBADF;
if (unlikely(!(file->f_mode & FMODE_WRITE)))
break;
ret = security_file_permission(file, MAY_WRITE);
if (unlikely(ret))
break;
ret = aio_setup_vectored_rw(WRITE, kiocb);
if (ret)
break;
ret = -EINVAL;
if (file->f_op->aio_write)
kiocb->ki_retry = aio_pwrite;
kiocb->ki_retry = aio_rw_vect_retry;
break;
case IOCB_CMD_FDSYNC:
ret = -EINVAL;
Expand Down
129 changes: 75 additions & 54 deletions fs/read_write.c
Original file line number Diff line number Diff line change
Expand Up @@ -511,6 +511,74 @@ ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov,
/* A write operation does a read from user space and vice versa */
#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)

ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
unsigned long nr_segs, unsigned long fast_segs,
struct iovec *fast_pointer,
struct iovec **ret_pointer)
{
unsigned long seg;
ssize_t ret;
struct iovec *iov = fast_pointer;

/*
* SuS says "The readv() function *may* fail if the iovcnt argument
* was less than or equal to 0, or greater than {IOV_MAX}. Linux has
* traditionally returned zero for zero segments, so...
*/
if (nr_segs == 0) {
ret = 0;
goto out;
}

/*
* First get the "struct iovec" from user memory and
* verify all the pointers
*/
if (nr_segs > UIO_MAXIOV) {
ret = -EINVAL;
goto out;
}
if (nr_segs > fast_segs) {
iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
if (iov == NULL) {
ret = -ENOMEM;
goto out;
}
}
if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
ret = -EFAULT;
goto out;
}

/*
* According to the Single Unix Specification we should return EINVAL
* if an element length is < 0 when cast to ssize_t or if the
* total length would overflow the ssize_t return value of the
* system call.
*/
ret = 0;
for (seg = 0; seg < nr_segs; seg++) {
void __user *buf = iov[seg].iov_base;
ssize_t len = (ssize_t)iov[seg].iov_len;

/* see if we we're about to use an invalid len or if
* it's about to overflow ssize_t */
if (len < 0 || (ret + len < ret)) {
ret = -EINVAL;
goto out;
}
if (unlikely(!access_ok(vrfy_dir(type), buf, len))) {
ret = -EFAULT;
goto out;
}

ret += len;
}
out:
*ret_pointer = iov;
return ret;
}

static ssize_t do_readv_writev(int type, struct file *file,
const struct iovec __user * uvector,
unsigned long nr_segs, loff_t *pos)
Expand All @@ -519,64 +587,20 @@ static ssize_t do_readv_writev(int type, struct file *file,
struct iovec iovstack[UIO_FASTIOV];
struct iovec *iov = iovstack;
ssize_t ret;
int seg;
io_fn_t fn;
iov_fn_t fnv;

/*
* SuS says "The readv() function *may* fail if the iovcnt argument
* was less than or equal to 0, or greater than {IOV_MAX}. Linux has
* traditionally returned zero for zero segments, so...
*/
ret = 0;
if (nr_segs == 0)
goto out;

/*
* First get the "struct iovec" from user memory and
* verify all the pointers
*/
ret = -EINVAL;
if (nr_segs > UIO_MAXIOV)
goto out;
if (!file->f_op)
goto out;
if (nr_segs > UIO_FASTIOV) {
ret = -ENOMEM;
iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
if (!iov)
goto out;
}
ret = -EFAULT;
if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector)))
if (!file->f_op) {
ret = -EINVAL;
goto out;
}

/*
* Single unix specification:
* We should -EINVAL if an element length is not >= 0 and fitting an
* ssize_t. The total length is fitting an ssize_t
*
* Be careful here because iov_len is a size_t not an ssize_t
*/
tot_len = 0;
ret = -EINVAL;
for (seg = 0; seg < nr_segs; seg++) {
void __user *buf = iov[seg].iov_base;
ssize_t len = (ssize_t)iov[seg].iov_len;

if (len < 0) /* size_t not fitting an ssize_t .. */
goto out;
if (unlikely(!access_ok(vrfy_dir(type), buf, len)))
goto Efault;
tot_len += len;
if ((ssize_t)tot_len < 0) /* maths overflow on the ssize_t */
goto out;
}
if (tot_len == 0) {
ret = 0;
ret = rw_copy_check_uvector(type, uvector, nr_segs,
ARRAY_SIZE(iovstack), iovstack, &iov);
if (ret <= 0)
goto out;
}

tot_len = ret;
ret = rw_verify_area(type, file, pos, tot_len);
if (ret < 0)
goto out;
Expand Down Expand Up @@ -609,9 +633,6 @@ static ssize_t do_readv_writev(int type, struct file *file,
fsnotify_modify(file->f_dentry);
}
return ret;
Efault:
ret = -EFAULT;
goto out;
}

ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
Expand Down
Loading

0 comments on commit eed4e51

Please sign in to comment.