Skip to content

Commit

Permalink
Merge tag 'fuse-update-5.8' of git://git.kernel.org/pub/scm/linux/ker…
Browse files Browse the repository at this point in the history
…nel/git/mszeredi/fuse

Pull fuse updates from Miklos Szeredi:

 - Fix a rare deadlock in virtiofs

 - Fix st_blocks in writeback cache mode

 - Fix wrong checks in splice move causing spurious warnings

 - Fix a race between a GETATTR request and a FUSE_NOTIFY_INVAL_INODE
   notification

 - Use rb-tree instead of linear search for pages currently under
   writeout by userspace

 - Fix copy_file_range() inconsistencies

* tag 'fuse-update-5.8' of git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/fuse:
  fuse: copy_file_range should truncate cache
  fuse: fix copy_file_range cache issues
  fuse: optimize writepages search
  fuse: update attr_version counter on fuse_notify_inval_inode()
  fuse: don't check refcount after stealing page
  fuse: fix weird page warning
  fuse: use dump_page
  virtiofs: do not use fuse_fill_super_common() for device installation
  fuse: always allow query of st_dev
  fuse: always flush dirty data on close(2)
  fuse: invalidate inode attr in writeback cache mode
  fuse: Update stale comment in queue_interrupt()
  fuse: BUG_ON correction in fuse_dev_splice_write()
  virtiofs: Add mount option and atime behavior to the doc
  virtiofs: schedule blocking async replies in separate worker
  • Loading branch information
Linus Torvalds committed Jun 9, 2020
2 parents 52435c8 + 9b46418 commit 5b14671
Showing 7 changed files with 219 additions and 85 deletions.
14 changes: 14 additions & 0 deletions Documentation/filesystems/virtiofs.rst
Original file line number Diff line number Diff line change
@@ -39,6 +39,20 @@ Mount file system with tag ``myfs`` on ``/mnt``:
Please see https://virtio-fs.gitlab.io/ for details on how to configure QEMU
and the virtiofsd daemon.

Mount options
-------------

virtiofs supports general VFS mount options, for example, remount,
ro, rw, context, etc. It also supports FUSE mount options.

atime behavior
^^^^^^^^^^^^^^

The atime-related mount options, for example, noatime, strictatime,
are ignored. The atime behavior for virtiofs is the same as the
underlying filesystem of the directory that has been exported
on the host.

Internals
=========
Since the virtio-fs device uses the FUSE protocol for file system requests, the
14 changes: 7 additions & 7 deletions fs/fuse/dev.c
Original file line number Diff line number Diff line change
@@ -342,7 +342,7 @@ static int queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
list_add_tail(&req->intr_entry, &fiq->interrupts);
/*
* Pairs with smp_mb() implied by test_and_set_bit()
* from request_end().
* from fuse_request_end().
*/
smp_mb();
if (test_bit(FR_FINISHED, &req->flags)) {
@@ -764,16 +764,15 @@ static int fuse_check_page(struct page *page)
{
if (page_mapcount(page) ||
page->mapping != NULL ||
page_count(page) != 1 ||
(page->flags & PAGE_FLAGS_CHECK_AT_PREP &
~(1 << PG_locked |
1 << PG_referenced |
1 << PG_uptodate |
1 << PG_lru |
1 << PG_active |
1 << PG_reclaim))) {
pr_warn("trying to steal weird page\n");
pr_warn(" page=%p index=%li flags=%08lx, count=%i, mapcount=%i, mapping=%p\n", page, page->index, page->flags, page_count(page), page_mapcount(page), page->mapping);
1 << PG_reclaim |
1 << PG_waiters))) {
dump_page(page, "fuse: trying to steal weird page");
return 1;
}
return 0;
@@ -1977,8 +1976,9 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
struct pipe_buffer *ibuf;
struct pipe_buffer *obuf;

BUG_ON(nbuf >= pipe->ring_size);
BUG_ON(tail == head);
if (WARN_ON(nbuf >= count || tail == head))
goto out_free;

ibuf = &pipe->bufs[tail & mask];
obuf = &bufs[nbuf];

12 changes: 11 additions & 1 deletion fs/fuse/dir.c
Original file line number Diff line number Diff line change
@@ -1689,8 +1689,18 @@ static int fuse_getattr(const struct path *path, struct kstat *stat,
struct inode *inode = d_inode(path->dentry);
struct fuse_conn *fc = get_fuse_conn(inode);

if (!fuse_allow_current_process(fc))
if (!fuse_allow_current_process(fc)) {
if (!request_mask) {
/*
* If user explicitly requested *nothing* then don't
* error out, but return st_dev only.
*/
stat->result_mask = 0;
stat->dev = inode->i_sb->s_dev;
return 0;
}
return -EACCES;
}

return fuse_update_get_attr(inode, NULL, stat, request_mask, flags);
}
120 changes: 92 additions & 28 deletions fs/fuse/file.c
Original file line number Diff line number Diff line change
@@ -357,7 +357,7 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id)

struct fuse_writepage_args {
struct fuse_io_args ia;
struct list_head writepages_entry;
struct rb_node writepages_entry;
struct list_head queue_entry;
struct fuse_writepage_args *next;
struct inode *inode;
@@ -366,17 +366,23 @@ struct fuse_writepage_args {
/*
 * Find a request under writeback whose page range overlaps
 * [idx_from, idx_to].  fi->writepages is an rb-tree ordered by the
 * request's starting page index; ranges in the tree are disjoint, so
 * at each node we either descend right (query lies entirely after the
 * node), descend left (entirely before), or have found an overlap.
 *
 * Called with fi->lock held (see the spin_lock(&fi->lock) sections at
 * the call sites).  Returns the overlapping request, or NULL if no
 * page in the range is currently under writeback.
 */
static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi,
						       pgoff_t idx_from, pgoff_t idx_to)
{
	struct rb_node *n;

	n = fi->writepages.rb_node;

	while (n) {
		struct fuse_writepage_args *wpa;
		pgoff_t curr_index;

		wpa = rb_entry(n, struct fuse_writepage_args, writepages_entry);
		WARN_ON(get_fuse_inode(wpa->inode) != fi);
		/* First page index covered by this request */
		curr_index = wpa->ia.write.in.offset >> PAGE_SHIFT;
		if (idx_from >= curr_index + wpa->ia.ap.num_pages)
			n = n->rb_right;	/* node ends before query range */
		else if (idx_to < curr_index)
			n = n->rb_left;		/* node starts after query range */
		else
			return wpa;		/* ranges overlap */
	}
	return NULL;
}
@@ -445,9 +451,6 @@ static int fuse_flush(struct file *file, fl_owner_t id)
if (is_bad_inode(inode))
return -EIO;

if (fc->no_flush)
return 0;

err = write_inode_now(inode, 1);
if (err)
return err;
@@ -460,6 +463,10 @@ static int fuse_flush(struct file *file, fl_owner_t id)
if (err)
return err;

err = 0;
if (fc->no_flush)
goto inval_attr_out;

memset(&inarg, 0, sizeof(inarg));
inarg.fh = ff->fh;
inarg.lock_owner = fuse_lock_owner_id(fc, id);
@@ -475,6 +482,14 @@ static int fuse_flush(struct file *file, fl_owner_t id)
fc->no_flush = 1;
err = 0;
}

inval_attr_out:
/*
* In memory i_blocks is not maintained by fuse, if writeback cache is
* enabled, i_blocks from cached attr may not be accurate.
*/
if (!err && fc->writeback_cache)
fuse_invalidate_attr(inode);
return err;
}

@@ -712,6 +727,7 @@ static ssize_t fuse_async_req_send(struct fuse_conn *fc,
spin_unlock(&io->lock);

ia->ap.args.end = fuse_aio_complete_req;
ia->ap.args.may_block = io->should_dirty;
err = fuse_simple_background(fc, &ia->ap.args, GFP_KERNEL);
if (err)
fuse_aio_complete_req(fc, &ia->ap.args, err);
@@ -1570,7 +1586,7 @@ static void fuse_writepage_finish(struct fuse_conn *fc,
struct backing_dev_info *bdi = inode_to_bdi(inode);
int i;

list_del(&wpa->writepages_entry);
rb_erase(&wpa->writepages_entry, &fi->writepages);
for (i = 0; i < ap->num_pages; i++) {
dec_wb_stat(&bdi->wb, WB_WRITEBACK);
dec_node_page_state(ap->pages[i], NR_WRITEBACK_TEMP);
@@ -1658,6 +1674,36 @@ __acquires(fi->lock)
}
}

/*
 * Insert a writepage request into the rb-tree of requests under
 * writeback, keyed by the request's starting page index.  Page ranges
 * stored in the tree are expected to be disjoint; an overlap with an
 * existing node triggers a WARN_ON and the request is NOT inserted.
 *
 * NOTE(review): call sites in this change take spin_lock(&fi->lock)
 * around tree_insert(), so the tree appears to be protected by
 * fi->lock — confirm against the full file.
 */
static void tree_insert(struct rb_root *root, struct fuse_writepage_args *wpa)
{
	/* Inclusive page-index range [idx_from, idx_to] this request covers */
	pgoff_t idx_from = wpa->ia.write.in.offset >> PAGE_SHIFT;
	pgoff_t idx_to = idx_from + wpa->ia.ap.num_pages - 1;
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;

	/* An empty request has no range to index — should never be queued */
	WARN_ON(!wpa->ia.ap.num_pages);
	while (*p) {
		struct fuse_writepage_args *curr;
		pgoff_t curr_index;

		parent = *p;
		curr = rb_entry(parent, struct fuse_writepage_args,
				writepages_entry);
		/* All requests in one tree must belong to the same inode */
		WARN_ON(curr->inode != wpa->inode);
		curr_index = curr->ia.write.in.offset >> PAGE_SHIFT;

		if (idx_from >= curr_index + curr->ia.ap.num_pages)
			p = &(*p)->rb_right;	/* new range lies after curr */
		else if (idx_to < curr_index)
			p = &(*p)->rb_left;	/* new range lies before curr */
		else
			/* Overlapping writeback ranges should not exist */
			return (void) WARN_ON(true);
	}

	rb_link_node(&wpa->writepages_entry, parent, p);
	rb_insert_color(&wpa->writepages_entry, root);
}

static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_args *args,
int error)
{
@@ -1676,7 +1722,7 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_args *args,
wpa->next = next->next;
next->next = NULL;
next->ia.ff = fuse_file_get(wpa->ia.ff);
list_add(&next->writepages_entry, &fi->writepages);
tree_insert(&fi->writepages, next);

/*
* Skip fuse_flush_writepages() to make it easy to crop requests
@@ -1811,7 +1857,7 @@ static int fuse_writepage_locked(struct page *page)
inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);

spin_lock(&fi->lock);
list_add(&wpa->writepages_entry, &fi->writepages);
tree_insert(&fi->writepages, wpa);
list_add_tail(&wpa->queue_entry, &fi->queued_writes);
fuse_flush_writepages(inode);
spin_unlock(&fi->lock);
@@ -1923,10 +1969,10 @@ static bool fuse_writepage_in_flight(struct fuse_writepage_args *new_wpa,
WARN_ON(new_ap->num_pages != 0);

spin_lock(&fi->lock);
list_del(&new_wpa->writepages_entry);
rb_erase(&new_wpa->writepages_entry, &fi->writepages);
old_wpa = fuse_find_writeback(fi, page->index, page->index);
if (!old_wpa) {
list_add(&new_wpa->writepages_entry, &fi->writepages);
tree_insert(&fi->writepages, new_wpa);
spin_unlock(&fi->lock);
return false;
}
@@ -2041,7 +2087,7 @@ static int fuse_writepages_fill(struct page *page,
wpa->inode = inode;

spin_lock(&fi->lock);
list_add(&wpa->writepages_entry, &fi->writepages);
tree_insert(&fi->writepages, wpa);
spin_unlock(&fi->lock);

data->wpa = wpa;
@@ -3235,25 +3281,39 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
return -EXDEV;

if (fc->writeback_cache) {
inode_lock(inode_in);
err = fuse_writeback_range(inode_in, pos_in, pos_in + len);
inode_unlock(inode_in);
if (err)
return err;
}
inode_lock(inode_in);
err = fuse_writeback_range(inode_in, pos_in, pos_in + len - 1);
inode_unlock(inode_in);
if (err)
return err;

inode_lock(inode_out);

err = file_modified(file_out);
if (err)
goto out;

if (fc->writeback_cache) {
err = fuse_writeback_range(inode_out, pos_out, pos_out + len);
if (err)
goto out;
}
/*
* Write out dirty pages in the destination file before sending the COPY
* request to userspace. After the request is completed, truncate off
* pages (including partial ones) from the cache that have been copied,
* since these contain stale data at that point.
*
* This should be mostly correct, but if the COPY writes to partial
* pages (at the start or end) and the parts not covered by the COPY are
* written through a memory map after calling fuse_writeback_range(),
* then these partial page modifications will be lost on truncation.
*
* It is unlikely that someone would rely on such mixed style
* modifications. Yet this does give less guarantees than if the
* copying was performed with write(2).
*
* To fix this a i_mmap_sem style lock could be used to prevent new
* faults while the copy is ongoing.
*/
err = fuse_writeback_range(inode_out, pos_out, pos_out + len - 1);
if (err)
goto out;

if (is_unstable)
set_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state);
@@ -3274,6 +3334,10 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
if (err)
goto out;

truncate_inode_pages_range(inode_out->i_mapping,
ALIGN_DOWN(pos_out, PAGE_SIZE),
ALIGN(pos_out + outarg.size, PAGE_SIZE) - 1);

if (fc->writeback_cache) {
fuse_write_update_size(inode_out, pos_out + outarg.size);
file_update_time(file_out);
@@ -3351,5 +3415,5 @@ void fuse_init_file_inode(struct inode *inode)
INIT_LIST_HEAD(&fi->queued_writes);
fi->writectr = 0;
init_waitqueue_head(&fi->page_waitq);
INIT_LIST_HEAD(&fi->writepages);
fi->writepages = RB_ROOT;
}
3 changes: 2 additions & 1 deletion fs/fuse/fuse_i.h
Original file line number Diff line number Diff line change
@@ -111,7 +111,7 @@ struct fuse_inode {
wait_queue_head_t page_waitq;

/* List of writepage requests (pending or sent) */
struct list_head writepages;
struct rb_root writepages;
};

/* readdir cache (directory only) */
@@ -249,6 +249,7 @@ struct fuse_args {
bool out_argvar:1;
bool page_zeroing:1;
bool page_replace:1;
bool may_block:1;
struct fuse_in_arg in_args[3];
struct fuse_arg out_args[2];
void (*end)(struct fuse_conn *fc, struct fuse_args *args, int error);
Loading

0 comments on commit 5b14671

Please sign in to comment.