From e82051193a171f393d2a165a7ce18d8a2e2b4837 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 7 Apr 2024 02:42:36 -0400 Subject: [PATCH 1/4] new helper: copy_to_iter_full() ... and convert copy_linear_skb() to using that. Signed-off-by: Al Viro --- include/linux/uio.h | 10 ++++++++++ include/net/udp.h | 9 +-------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/include/linux/uio.h b/include/linux/uio.h index 00cebe2b70de7..7020adedfa08c 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -205,6 +205,16 @@ size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) return 0; } +static __always_inline __must_check +bool copy_to_iter_full(const void *addr, size_t bytes, struct iov_iter *i) +{ + size_t copied = copy_to_iter(addr, bytes, i); + if (likely(copied == bytes)) + return true; + iov_iter_revert(i, copied); + return false; +} + static __always_inline __must_check bool copy_from_iter_full(void *addr, size_t bytes, struct iov_iter *i) { diff --git a/include/net/udp.h b/include/net/udp.h index 488a6d2babccf..c4e05b14b648a 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -379,14 +379,7 @@ static inline bool udp_skb_is_linear(struct sk_buff *skb) static inline int copy_linear_skb(struct sk_buff *skb, int len, int off, struct iov_iter *to) { - int n; - - n = copy_to_iter(skb->data + off, len, to); - if (n == len) - return 0; - - iov_iter_revert(to, n); - return -EFAULT; + return copy_to_iter_full(skb->data + off, len, to) ? 0 : -EFAULT; } /* From d94979904105a7ad8dca6fdcd8cb3fbecada22f1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 1 Apr 2024 11:48:37 -0600 Subject: [PATCH 2/4] timerfd: convert to ->read_iter() Switch timerfd to using fops->read_iter(), so it can support not just O_NONBLOCK but IOCB_NOWAIT as well. With the latter, users like io_uring interact with timerfds a lot better, as they can be driven purely by the poll trigger. 
Manually get and install the required fd, so that FMODE_NOWAIT can be set before the file is installed into the file table. No functional changes intended in this patch, it's purely a straight conversion to using the read iterator method. Signed-off-by: Jens Axboe --- fs/timerfd.c | 36 ++++++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/fs/timerfd.c b/fs/timerfd.c index e9c96a0c79f11..4bf2f8bfec112 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -262,17 +262,18 @@ static __poll_t timerfd_poll(struct file *file, poll_table *wait) return events; } -static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count, - loff_t *ppos) +static ssize_t timerfd_read_iter(struct kiocb *iocb, struct iov_iter *to) { + struct file *file = iocb->ki_filp; struct timerfd_ctx *ctx = file->private_data; ssize_t res; u64 ticks = 0; - if (count < sizeof(ticks)) + if (iov_iter_count(to) < sizeof(ticks)) return -EINVAL; + spin_lock_irq(&ctx->wqh.lock); - if (file->f_flags & O_NONBLOCK) + if (file->f_flags & O_NONBLOCK || iocb->ki_flags & IOCB_NOWAIT) res = -EAGAIN; else res = wait_event_interruptible_locked_irq(ctx->wqh, ctx->ticks); @@ -312,8 +313,11 @@ static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count, ctx->ticks = 0; } spin_unlock_irq(&ctx->wqh.lock); - if (ticks) - res = put_user(ticks, (u64 __user *) buf) ? 
-EFAULT: sizeof(ticks); + if (ticks) { + res = copy_to_iter(&ticks, sizeof(ticks), to); + if (!res) + res = -EFAULT; + } return res; } @@ -384,7 +388,7 @@ static long timerfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg static const struct file_operations timerfd_fops = { .release = timerfd_release, .poll = timerfd_poll, - .read = timerfd_read, + .read_iter = timerfd_read_iter, .llseek = noop_llseek, .show_fdinfo = timerfd_show, .unlocked_ioctl = timerfd_ioctl, @@ -407,6 +411,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) { int ufd; struct timerfd_ctx *ctx; + struct file *file; /* Check the TFD_* constants for consistency. */ BUILD_BUG_ON(TFD_CLOEXEC != O_CLOEXEC); @@ -443,11 +448,22 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) ctx->moffs = ktime_mono_to_real(0); - ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx, - O_RDWR | (flags & TFD_SHARED_FCNTL_FLAGS)); - if (ufd < 0) + ufd = get_unused_fd_flags(flags & TFD_SHARED_FCNTL_FLAGS); + if (ufd < 0) { + kfree(ctx); + return ufd; + } + + file = anon_inode_getfile("[timerfd]", &timerfd_fops, ctx, + O_RDWR | (flags & TFD_SHARED_FCNTL_FLAGS)); + if (IS_ERR(file)) { + put_unused_fd(ufd); kfree(ctx); + return PTR_ERR(file); + } + file->f_mode |= FMODE_NOWAIT; + fd_install(ufd, file); return ufd; } From 40f45fe8eb7efd70e772447dc98bb50c5e323ccb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 2 Apr 2024 12:26:41 -0600 Subject: [PATCH 3/4] userfaultfd: convert to ->read_iter() Rather than use the older style ->read() hook, use ->read_iter() so that userfaultfd can support both O_NONBLOCK and IOCB_NOWAIT for non-blocking read attempts. Split the fd setup into two parts, so that userfaultfd can mark the file mode with FMODE_NOWAIT before installing it into the process table. With that, we can also defer grabbing the mm until we know the rest will succeed, as the fd isn't visible before then. 
Signed-off-by: Jens Axboe --- fs/userfaultfd.c | 44 ++++++++++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 60dcfafdc11a8..6d963402c8350 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -31,6 +31,7 @@ #include <linux/hugetlb.h> #include <linux/swapops.h> #include <linux/miscdevice.h> +#include <linux/uio.h> static int sysctl_unprivileged_userfaultfd __read_mostly; @@ -282,7 +283,7 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, /* * Verify the pagetables are still not ok after having reigstered into * the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any - * userfault that has already been resolved, if userfaultfd_read and + * userfault that has already been resolved, if userfaultfd_read_iter and * UFFDIO_COPY|ZEROPAGE are being run simultaneously on two different * threads. */ @@ -1177,34 +1178,34 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, return ret; } -static ssize_t userfaultfd_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) +static ssize_t userfaultfd_read_iter(struct kiocb *iocb, struct iov_iter *to) { + struct file *file = iocb->ki_filp; struct userfaultfd_ctx *ctx = file->private_data; ssize_t _ret, ret = 0; struct uffd_msg msg; - int no_wait = file->f_flags & O_NONBLOCK; struct inode *inode = file_inode(file); + bool no_wait; if (!userfaultfd_is_initialized(ctx)) return -EINVAL; + no_wait = file->f_flags & O_NONBLOCK || iocb->ki_flags & IOCB_NOWAIT; for (;;) { - if (count < sizeof(msg)) + if (iov_iter_count(to) < sizeof(msg)) return ret ? ret : -EINVAL; _ret = userfaultfd_ctx_read(ctx, no_wait, &msg, inode); if (_ret < 0) return ret ? ret : _ret; - if (copy_to_user((__u64 __user *) buf, &msg, sizeof(msg))) + _ret = !copy_to_iter_full(&msg, sizeof(msg), to); + if (_ret) return ret ?
ret : -EFAULT; ret += sizeof(msg); - buf += sizeof(msg); - count -= sizeof(msg); /* * Allow to read more than one fault at time but only * block if waiting for the very first one. */ - no_wait = O_NONBLOCK; + no_wait = true; } } @@ -2172,7 +2173,7 @@ static const struct file_operations userfaultfd_fops = { #endif .release = userfaultfd_release, .poll = userfaultfd_poll, - .read = userfaultfd_read, + .read_iter = userfaultfd_read_iter, .unlocked_ioctl = userfaultfd_ioctl, .compat_ioctl = compat_ptr_ioctl, .llseek = noop_llseek, @@ -2192,6 +2193,7 @@ static void init_once_userfaultfd_ctx(void *mem) static int new_userfaultfd(int flags) { struct userfaultfd_ctx *ctx; + struct file *file; int fd; BUG_ON(!current->mm); @@ -2215,16 +2217,26 @@ static int new_userfaultfd(int flags) init_rwsem(&ctx->map_changing_lock); atomic_set(&ctx->mmap_changing, 0); ctx->mm = current->mm; - /* prevent the mm struct to be freed */ - mmgrab(ctx->mm); + + fd = get_unused_fd_flags(flags & UFFD_SHARED_FCNTL_FLAGS); + if (fd < 0) + goto err_out; /* Create a new inode so that the LSM can block the creation. 
*/ - fd = anon_inode_create_getfd("[userfaultfd]", &userfaultfd_fops, ctx, + file = anon_inode_create_getfile("[userfaultfd]", &userfaultfd_fops, ctx, O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS), NULL); - if (fd < 0) { - mmdrop(ctx->mm); - kmem_cache_free(userfaultfd_ctx_cachep, ctx); + if (IS_ERR(file)) { + put_unused_fd(fd); + fd = PTR_ERR(file); + goto err_out; } + /* prevent the mm struct to be freed */ + mmgrab(ctx->mm); + file->f_mode |= FMODE_NOWAIT; + fd_install(fd, file); + return fd; +err_out: + kmem_cache_free(userfaultfd_ctx_cachep, ctx); return fd; } From fbe38120eb1dec94280d0381ce4aea52c44367b1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 2 Apr 2024 12:12:18 -0600 Subject: [PATCH 4/4] signalfd: convert to ->read_iter() Rather than use the older style ->read() hook, use ->read_iter() so that signalfd can support both O_NONBLOCK and IOCB_NOWAIT for non-blocking read attempts. Split the fd setup into two parts, so that signalfd can mark the file mode with FMODE_NOWAIT before installing it into the process table. 
Signed-off-by: Jens Axboe --- fs/signalfd.c | 44 ++++++++++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/fs/signalfd.c b/fs/signalfd.c index e20d1484c6633..4a5614442dbfa 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -68,8 +68,7 @@ static __poll_t signalfd_poll(struct file *file, poll_table *wait) /* * Copied from copy_siginfo_to_user() in kernel/signal.c */ -static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo, - kernel_siginfo_t const *kinfo) +static int signalfd_copyinfo(struct iov_iter *to, kernel_siginfo_t const *kinfo) { struct signalfd_siginfo new; @@ -146,10 +145,10 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo, break; } - if (copy_to_user(uinfo, &new, sizeof(struct signalfd_siginfo))) + if (!copy_to_iter_full(&new, sizeof(struct signalfd_siginfo), to)) return -EFAULT; - return sizeof(*uinfo); + return sizeof(struct signalfd_siginfo); } static ssize_t signalfd_dequeue(struct signalfd_ctx *ctx, kernel_siginfo_t *info, @@ -199,28 +198,27 @@ static ssize_t signalfd_dequeue(struct signalfd_ctx *ctx, kernel_siginfo_t *info * error code. The "count" parameter must be at least the size of a * "struct signalfd_siginfo". 
*/ -static ssize_t signalfd_read(struct file *file, char __user *buf, size_t count, - loff_t *ppos) +static ssize_t signalfd_read_iter(struct kiocb *iocb, struct iov_iter *to) { + struct file *file = iocb->ki_filp; struct signalfd_ctx *ctx = file->private_data; - struct signalfd_siginfo __user *siginfo; - int nonblock = file->f_flags & O_NONBLOCK; + size_t count = iov_iter_count(to); ssize_t ret, total = 0; kernel_siginfo_t info; + bool nonblock; count /= sizeof(struct signalfd_siginfo); if (!count) return -EINVAL; - siginfo = (struct signalfd_siginfo __user *) buf; + nonblock = file->f_flags & O_NONBLOCK || iocb->ki_flags & IOCB_NOWAIT; do { ret = signalfd_dequeue(ctx, &info, nonblock); if (unlikely(ret <= 0)) break; - ret = signalfd_copyinfo(siginfo, &info); + ret = signalfd_copyinfo(to, &info); if (ret < 0) break; - siginfo++; total += ret; nonblock = 1; } while (--count); @@ -246,7 +244,7 @@ static const struct file_operations signalfd_fops = { #endif .release = signalfd_release, .poll = signalfd_poll, - .read = signalfd_read, + .read_iter = signalfd_read_iter, .llseek = noop_llseek, }; @@ -265,20 +263,34 @@ static int do_signalfd4(int ufd, sigset_t *mask, int flags) signotset(mask); if (ufd == -1) { + struct file *file; + ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return -ENOMEM; ctx->sigmask = *mask; + ufd = get_unused_fd_flags(flags & O_CLOEXEC); + if (ufd < 0) { + kfree(ctx); + return ufd; + } + + file = anon_inode_getfile("[signalfd]", &signalfd_fops, ctx, + O_RDWR | (flags & O_NONBLOCK)); + if (IS_ERR(file)) { + put_unused_fd(ufd); + kfree(ctx); + return PTR_ERR(file); + } + file->f_mode |= FMODE_NOWAIT; + /* * When we call this, the initialization must be complete, since * anon_inode_getfd() will install the fd. */ - ufd = anon_inode_getfd("[signalfd]", &signalfd_fops, ctx, - O_RDWR | (flags & (O_CLOEXEC | O_NONBLOCK))); - if (ufd < 0) - kfree(ctx); + fd_install(ufd, file); } else { struct fd f = fdget(ufd); if (!f.file)