From 428f651cb80b227af47fc302e4931791f2fb4741 Mon Sep 17 00:00:00 2001 From: Bob Peterson <rpeterso@redhat.com> Date: Mon, 17 Jan 2022 10:25:07 -0500 Subject: [PATCH 01/15] gfs2: assign rgrp glock before compute_bitstructs Before this patch, function read_rindex_entry called compute_bitstructs before it allocated a glock for the rgrp. But if compute_bitstructs found a problem with the rgrp, it called gfs2_consist_rgrpd, and that called gfs2_dump_glock for rgd->rd_gl which had not yet been assigned. read_rindex_entry compute_bitstructs gfs2_consist_rgrpd gfs2_dump_glock <---------rgd->rd_gl was not set. This patch changes read_rindex_entry so it assigns an rgrp glock before calling compute_bitstructs so gfs2_dump_glock does not reference an unassigned pointer. If an error is discovered, the glock must also be put, so a new goto and label were added. Reported-by: syzbot+c6fd14145e2f62ca0784@syzkaller.appspotmail.com Signed-off-by: Bob Peterson <rpeterso@redhat.com> Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com> --- fs/gfs2/rgrp.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 0fb3c01bc5577..9b04a570c5826 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -922,15 +922,15 @@ static int read_rindex_entry(struct gfs2_inode *ip) spin_lock_init(&rgd->rd_rsspin); mutex_init(&rgd->rd_mutex); - error = compute_bitstructs(rgd); - if (error) - goto fail; - error = gfs2_glock_get(sdp, rgd->rd_addr, &gfs2_rgrp_glops, CREATE, &rgd->rd_gl); if (error) goto fail; + error = compute_bitstructs(rgd); + if (error) + goto fail_glock; + rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr; rgd->rd_flags &= ~GFS2_RDF_PREFERRED; if (rgd->rd_data > sdp->sd_max_rg_data) @@ -944,6 +944,7 @@ static int read_rindex_entry(struct gfs2_inode *ip) } error = 0; /* someone else read in the rgrp; free it and ignore it */ +fail_glock: gfs2_glock_put(rgd->rd_gl); fail: From 7336905a89f19173bf9301cd50a24421162f417c Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher <agruenba@redhat.com> Date: Fri, 10 Dec 2021 14:43:36 +0100 Subject: [PATCH 02/15] gfs2: gfs2_setattr_size error path fix When gfs2_setattr_size() fails, it calls gfs2_rs_delete(ip, NULL) to get rid of any reservations the inode may have. Instead, it should pass in the inode's write count as the second parameter to allow gfs2_rs_delete() to figure out if the inode has any writers left. In a next step, there are two instances of gfs2_rs_delete(ip, NULL) left where we know that there can be no other users of the inode. Replace those with gfs2_rs_deltree(&ip->i_res) to avoid the unnecessary write count check. With that, gfs2_rs_delete() is only called with the inode's actual write count, so get rid of the second parameter. Fixes: a097dc7e24cb ("GFS2: Make rgrp reservations part of the gfs2_inode structure") Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com> --- fs/gfs2/bmap.c | 2 +- fs/gfs2/file.c | 2 +- fs/gfs2/inode.c | 2 +- fs/gfs2/rgrp.c | 7 ++++--- fs/gfs2/rgrp.h | 2 +- fs/gfs2/super.c | 2 +- 6 files changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index d67108489148e..fbdb7a30470a3 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -2146,7 +2146,7 @@ int gfs2_setattr_size(struct inode *inode, u64 newsize) ret = do_shrink(inode, newsize); out: - gfs2_rs_delete(ip, NULL); + gfs2_rs_delete(ip); gfs2_qa_put(ip); return ret; } diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 8c39a8571b1fa..5bac4f6e8e057 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -706,7 +706,7 @@ static int gfs2_release(struct inode *inode, struct file *file) if (file->f_mode & FMODE_WRITE) { if (gfs2_rs_active(&ip->i_res)) - gfs2_rs_delete(ip, &inode->i_writecount); + gfs2_rs_delete(ip); gfs2_qa_put(ip); } return 0; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 89905f4f29bb6..66a123306aecb 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -793,7 +793,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, if (free_vfs_inode) /* else evict will do the put for us */ gfs2_glock_put(ip->i_gl); } - gfs2_rs_delete(ip, NULL); + gfs2_rs_deltree(&ip->i_res); gfs2_qa_put(ip); fail_free_acls: posix_acl_release(default_acl); diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 9b04a570c5826..9df8a84f85a64 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -680,13 +680,14 @@ void gfs2_rs_deltree(struct gfs2_blkreserv *rs) /** * gfs2_rs_delete - delete a multi-block reservation * @ip: The inode for this reservation - * @wcount: The inode's write count, or NULL * */ -void gfs2_rs_delete(struct gfs2_inode *ip, atomic_t *wcount) +void gfs2_rs_delete(struct gfs2_inode *ip) { + struct inode *inode = &ip->i_inode; + down_write(&ip->i_rw_mutex); - if ((wcount == NULL) || (atomic_read(wcount) <= 1)) + if (atomic_read(&inode->i_writecount) <= 1) gfs2_rs_deltree(&ip->i_res); up_write(&ip->i_rw_mutex); } diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index 3e2ca1fb43056..46dd94e9e085c 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -45,7 +45,7 @@ extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n, bool dinode, u64 *generation); extern void gfs2_rs_deltree(struct gfs2_blkreserv *rs); -extern void gfs2_rs_delete(struct gfs2_inode *ip, atomic_t *wcount); +extern void gfs2_rs_delete(struct gfs2_inode *ip); extern void __gfs2_free_blocks(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd, u64 bstart, u32 blen, int meta); extern void gfs2_free_meta(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd, diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 64c67090f5036..143a47359d1b8 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1396,7 +1396,7 @@ static void gfs2_evict_inode(struct inode *inode) truncate_inode_pages_final(&inode->i_data); if (ip->i_qadata) gfs2_assert_warn(sdp, ip->i_qadata->qa_ref == 0); - gfs2_rs_delete(ip, NULL); + gfs2_rs_deltree(&ip->i_res); gfs2_ordered_del_inode(ip); clear_inode(inode); gfs2_dir_hash_inval(ip); From a892b12393afb74c397dc752eaba1e96e2702303 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher <agruenba@redhat.com> Date: Wed, 2 Feb 2022 11:00:48 +0100 Subject: [PATCH 03/15] gfs2: Expect -EBUSY after canceling dlm locking requests Due to the asynchronous nature of the dlm api, when we request a pending locking request to be canceled with dlm_unlock(DLM_LKF_CANCEL), the locking request will either complete before it could be canceled, or the cancellation will succeed. In either case, gdlm_ast will be called once and the status will indicate the outcome of the locking request, with -DLM_ECANCEL indicating a canceled request. Inside dlm, when a locking request completes before its cancel request could be processed, gdlm_ast will be called, but the lock will still be considered busy until a DLM_MSG_CANCEL_REPLY message completes the cancel request. During that time, successive dlm_lock() or dlm_unlock() requests for that lock will return -EBUSY. In other words, waiting for the gdlm_ast call before issuing the next locking request is not enough. There is no way of waiting for a cancel request to actually complete, either. We rarely cancel locking requests, but when we do, we don't know when the next locking request for that lock will occur. This means that any dlm_lock() or dlm_unlock() call can potentially return -EBUSY. When that happens, this patch simply repeats the request after a short pause. This workaround could be improved upon by tracking for which dlm locks cancel requests have been issued, but that isn't strictly necessary and it would complicate the code. We haven't seen -EBUSY errors from dlm without cancel requests. Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com> --- fs/gfs2/lock_dlm.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 50578f881e6de..2559a79cf14be 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -261,6 +261,7 @@ static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state, int req; u32 lkf; char strname[GDLM_STRNAME_BYTES] = ""; + int error; req = make_mode(gl->gl_name.ln_sbd, req_state); lkf = make_flags(gl, flags, req); @@ -279,8 +280,14 @@ static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state, * Submit the actual lock request. */ - return dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, strname, +again: + error = dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, strname, GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast); + if (error == -EBUSY) { + msleep(20); + goto again; + } + return error; } static void gdlm_put_lock(struct gfs2_glock *gl) @@ -312,8 +319,14 @@ static void gdlm_put_lock(struct gfs2_glock *gl) return; } +again: error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_VALBLK, NULL, gl); + if (error == -EBUSY) { + msleep(20); + goto again; + } + if (error) { fs_err(sdp, "gdlm_unlock %x,%llx err=%d\n", gl->gl_name.ln_type, From 1fc05c8d8426d4085a219c23f8855c4aaf9e3ffb Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher <agruenba@redhat.com> Date: Mon, 24 Jan 2022 12:23:55 -0500 Subject: [PATCH 04/15] gfs2: cancel timed-out glock requests The gfs2 evict code tries to upgrade the iopen glock from SH to EX. If the attempt to upgrade times out, gfs2 needs to tell dlm to cancel the lock request or it can deadlock. We also need to wake up the process waiting for the lock when dlm sends its AST back to gfs2. Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com> Signed-off-by: Bob Peterson <rpeterso@redhat.com> --- fs/gfs2/glock.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 6b23399eaee0c..d368d9a2e8f00 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -669,6 +669,8 @@ static void finish_xmote(struct gfs2_glock *gl, unsigned int ret) /* Check for state != intended state */ if (unlikely(state != gl->gl_target)) { + if (gh && (ret & LM_OUT_CANCELED)) + gfs2_holder_wake(gh); if (gh && !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) { /* move to back of queue and try next entry */ if (ret & LM_OUT_CANCELED) { @@ -1691,6 +1693,14 @@ void gfs2_glock_dq(struct gfs2_holder *gh) struct gfs2_glock *gl = gh->gh_gl; spin_lock(&gl->gl_lockref.lock); + if (list_is_first(&gh->gh_list, &gl->gl_holders) && + !test_bit(HIF_HOLDER, &gh->gh_iflags)) { + spin_unlock(&gl->gl_lockref.lock); + gl->gl_name.ln_sbd->sd_lockstruct.ls_ops->lm_cancel(gl); + wait_on_bit(&gh->gh_iflags, HIF_WAIT, TASK_UNINTERRUPTIBLE); + spin_lock(&gl->gl_lockref.lock); + } + __gfs2_glock_dq(gh); spin_unlock(&gl->gl_lockref.lock); } From 29464ee36bcaaee2691249f49b9592b8d5c97ece Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher <agruenba@redhat.com> Date: Mon, 24 Jan 2022 12:23:57 -0500 Subject: [PATCH 05/15] gfs2: Switch lock order of inode and iopen glock This patch tries to fix the continual ABBA deadlocks we keep having between the iopen and inode glocks. This switches the lock order in gfs2_inode_lookup and gfs2_create_inode so the iopen glock is always locked first. Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com> Signed-off-by: Bob Peterson <rpeterso@redhat.com> --- fs/gfs2/inode.c | 49 +++++++++++++++++++++++++++---------------------- 1 file changed, 27 insertions(+), 22 deletions(-) diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 66a123306aecb..c8ec876f33ea3 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -131,7 +131,21 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_glock *io_gl; - error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl); + error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, + &ip->i_gl); + if (unlikely(error)) + goto fail; + + error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, + &io_gl); + if (unlikely(error)) + goto fail; + + if (blktype != GFS2_BLKST_UNLINKED) + gfs2_cancel_delete_work(io_gl); + error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, + &ip->i_iopen_gh); + gfs2_glock_put(io_gl); if (unlikely(error)) goto fail; @@ -161,16 +175,6 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, set_bit(GLF_INSTANTIATE_NEEDED, &ip->i_gl->gl_flags); - error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, &io_gl); - if (unlikely(error)) - goto fail; - if (blktype != GFS2_BLKST_UNLINKED) - gfs2_cancel_delete_work(io_gl); - error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh); - gfs2_glock_put(io_gl); - if (unlikely(error)) - goto fail; - /* Lowest possible timestamp; will be overwritten in gfs2_dinode_in. */ inode->i_atime.tv_sec = 1LL << (8 * sizeof(inode->i_atime.tv_sec) - 1); inode->i_atime.tv_nsec = 0; @@ -716,13 +720,17 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, error = insert_inode_locked4(inode, ip->i_no_addr, iget_test, &ip->i_no_addr); BUG_ON(error); - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1); + error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh); if (error) goto fail_gunlock2; + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1); + if (error) + goto fail_gunlock3; + error = gfs2_trans_begin(sdp, blocks, 0); if (error) - goto fail_gunlock2; + goto fail_gunlock3; if (blocks > 1) { ip->i_eattr = ip->i_no_addr + 1; @@ -731,10 +739,6 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, init_dinode(dip, ip, symname); gfs2_trans_end(sdp); - error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh); - if (error) - goto fail_gunlock2; - glock_set_object(ip->i_gl, ip); glock_set_object(io_gl, ip); gfs2_set_iop(inode); @@ -745,14 +749,14 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, if (default_acl) { error = __gfs2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); if (error) - goto fail_gunlock3; + goto fail_gunlock4; posix_acl_release(default_acl); default_acl = NULL; } if (acl) { error = __gfs2_set_acl(inode, acl, ACL_TYPE_ACCESS); if (error) - goto fail_gunlock3; + goto fail_gunlock4; posix_acl_release(acl); acl = NULL; } @@ -760,11 +764,11 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, error = security_inode_init_security(&ip->i_inode, &dip->i_inode, name, &gfs2_initxattrs, NULL); if (error) - goto fail_gunlock3; + goto fail_gunlock4; error = link_dinode(dip, name, ip, &da); if (error) - goto fail_gunlock3; + goto fail_gunlock4; mark_inode_dirty(inode); d_instantiate(dentry, inode); @@ -782,9 +786,10 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, unlock_new_inode(inode); return error; -fail_gunlock3: +fail_gunlock4: glock_clear_object(ip->i_gl, ip); glock_clear_object(io_gl, ip); +fail_gunlock3: gfs2_glock_dq_uninit(&ip->i_iopen_gh); fail_gunlock2: gfs2_glock_put(io_gl); From 5a27a43efd1d5cd55dd5988cac596bdea7294d73 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher <agruenba@redhat.com> Date: Thu, 13 Jan 2022 02:54:55 +0100 Subject: [PATCH 06/15] gfs2: Make use of list_is_first Use list_is_first() instead of open-coding it. Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com> --- fs/gfs2/glock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index d368d9a2e8f00..a12bb3e8cb9cb 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -542,7 +542,7 @@ __acquires(&gl->gl_lockref.lock) * some reason. If this holder is the head of the list, it * means we have a blocked holder at the head, so return 1. */ - if (gh->gh_list.prev == &gl->gl_holders) + if (list_is_first(&gh->gh_list, &gl->gl_holders)) return 1; do_error(gl, 0); break; From a4e8145edcfd66b69056c59885770b428eccca59 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher <agruenba@redhat.com> Date: Fri, 14 Jan 2022 09:05:13 +0100 Subject: [PATCH 07/15] gfs2: Initialize gh_error in gfs2_glock_nq The gh_error field if a glock holder is initialized to zero in gfs2_holder_init(). When a locking operation fails, gh_error is set to an error code; when it succeeds, the gh_error value is left unchanged. The field isn't initialized in gfs2_holder_reinit(), which is a problem. Instead of fixing that directly, initialize gh_error in gfs2_glock_nq(). That also obsoletes the assignment in do_flock(). Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com> --- fs/gfs2/file.c | 1 - fs/gfs2/glock.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 5bac4f6e8e057..a743a9eb602f5 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -1497,7 +1497,6 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl) if (error != GLR_TRYFAILED) break; fl_gh->gh_flags = LM_FLAG_TRY | GL_EXACT; - fl_gh->gh_error = 0; msleep(sleeptime); } if (error) { diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index a12bb3e8cb9cb..630c6550eacfe 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1261,7 +1261,6 @@ void __gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, u16 flags, gh->gh_owner_pid = get_pid(task_pid(current)); gh->gh_state = state; gh->gh_flags = flags; - gh->gh_error = 0; gh->gh_iflags = 0; gfs2_glock_hold(gl); } @@ -1567,6 +1566,7 @@ int gfs2_glock_nq(struct gfs2_holder *gh) if (test_bit(GLF_LRU, &gl->gl_flags)) gfs2_glock_remove_from_lru(gl); + gh->gh_error = 0; spin_lock(&gl->gl_lockref.lock); add_to_queue(gh); if (unlikely((LM_FLAG_NOEXP & gh->gh_flags) && From b2963932346f0631c5145f656a22d4237f0956fa Mon Sep 17 00:00:00 2001 From: Bob Peterson <rpeterso@redhat.com> Date: Thu, 3 Mar 2022 07:32:43 -0500 Subject: [PATCH 08/15] gfs2: Remove return value for gfs2_indirect_init The return value from function gfs2_indirect_init is never used, so remove it. Signed-off-by: Bob Peterson <rpeterso@redhat.com> Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com> --- fs/gfs2/bmap.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index fbdb7a30470a3..39080b2d6cf86 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -606,9 +606,9 @@ static int gfs2_hole_size(struct inode *inode, sector_t lblock, u64 len, return ret; } -static inline __be64 *gfs2_indirect_init(struct metapath *mp, - struct gfs2_glock *gl, unsigned int i, - unsigned offset, u64 bn) +static inline void gfs2_indirect_init(struct metapath *mp, + struct gfs2_glock *gl, unsigned int i, + unsigned offset, u64 bn) { __be64 *ptr = (__be64 *)(mp->mp_bh[i - 1]->b_data + ((i > 1) ? sizeof(struct gfs2_meta_header) : @@ -621,7 +621,6 @@ static inline __be64 *gfs2_indirect_init(struct metapath *mp, gfs2_buffer_clear_tail(mp->mp_bh[i], sizeof(struct gfs2_meta_header)); ptr += offset; *ptr = cpu_to_be64(bn); - return ptr; } enum alloc_state { From bb7f5d96aaa87d5aee2f5eb98ae0b84f08988489 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher <agruenba@redhat.com> Date: Mon, 7 Mar 2022 23:30:03 +0100 Subject: [PATCH 09/15] gfs2: Fix should_fault_in_pages() logic Fix the fault-in window size logic: * Use a maximum window size of 1 MiB instead of BIO_MAX_VECS * PAGE_SIZE. The previous window size was always one page because the pages variable was accidentally being defined and then redefined in should_fault_in_pages(). * The nr_dirtied heuristic for guessing when there might be memory pressure often results in very small window sizes. Don't let nr_dirtied drop below 8 pages (as btrfs does). * Compute the window size in units of bytes, not pages. * Account for page overlap (unaligned iterators). Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com> --- fs/gfs2/file.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index a743a9eb602f5..0c8cf10281dab 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -775,8 +775,7 @@ static inline bool should_fault_in_pages(ssize_t ret, struct iov_iter *i, size_t *window_size) { size_t count = iov_iter_count(i); - char __user *p; - int pages = 1; + size_t size, offs; if (likely(!count)) return false; @@ -785,18 +784,20 @@ static inline bool should_fault_in_pages(ssize_t ret, struct iov_iter *i, if (!iter_is_iovec(i)) return false; + size = PAGE_SIZE; + offs = offset_in_page(i->iov[0].iov_base + i->iov_offset); if (*prev_count != count || !*window_size) { - int pages, nr_dirtied; + size_t nr_dirtied; - pages = min_t(int, BIO_MAX_VECS, DIV_ROUND_UP(count, PAGE_SIZE)); + size = ALIGN(offs + count, PAGE_SIZE); + size = min_t(size_t, size, SZ_1M); nr_dirtied = max(current->nr_dirtied_pause - - current->nr_dirtied, 1); - pages = min(pages, nr_dirtied); + current->nr_dirtied, 8); + size = min(size, nr_dirtied << PAGE_SHIFT); } *prev_count = count; - p = i->iov[0].iov_base + i->iov_offset; - *window_size = (size_t)PAGE_SIZE * pages - offset_in_page(p); + *window_size = size - offs; return true; } From 52f3f033a5dbd023307520af1ff551cadfd7f037 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher <agruenba@redhat.com> Date: Mon, 14 Mar 2022 18:32:02 +0100 Subject: [PATCH 10/15] gfs2: Disable page faults during lockless buffered reads During lockless buffered reads, filemap_read() holds page cache page references while trying to copy data to the user-space buffer. The calling process isn't holding the inode glock, but the page references it holds prevent those pages from being removed from the page cache, and that prevents the underlying inode glock from being moved to another node. Thus, we can end up in the same kinds of distributed deadlock situations as with normal (non-lockless) buffered reads. Fix that by disabling page faults during lockless reads as well. Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com> --- fs/gfs2/file.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 0c8cf10281dab..44132e4c87890 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -957,14 +957,16 @@ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to) return ret; iocb->ki_flags &= ~IOCB_DIRECT; } + pagefault_disable(); iocb->ki_flags |= IOCB_NOIO; ret = generic_file_read_iter(iocb, to); iocb->ki_flags &= ~IOCB_NOIO; + pagefault_enable(); if (ret >= 0) { if (!iov_iter_count(to)) return ret; written = ret; - } else { + } else if (ret != -EFAULT) { if (ret != -EAGAIN) return ret; if (iocb->ki_flags & IOCB_NOWAIT) From 124c458a401a2497f796e4f2d6cafac6edbea8e9 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher <agruenba@redhat.com> Date: Thu, 17 Mar 2022 14:20:38 +0100 Subject: [PATCH 11/15] gfs2: Minor retry logic cleanup Clean up the retry logic in the read and write functions somewhat. Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com> --- fs/gfs2/file.c | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 44132e4c87890..44bb886eefce5 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -852,9 +852,9 @@ static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to, leftover = fault_in_iov_iter_writeable(to, window_size); gfs2_holder_disallow_demote(gh); if (leftover != window_size) { - if (!gfs2_holder_queued(gh)) - goto retry; - goto retry_under_glock; + if (gfs2_holder_queued(gh)) + goto retry_under_glock; + goto retry; } } if (gfs2_holder_queued(gh)) @@ -921,9 +921,9 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from, leftover = fault_in_iov_iter_readable(from, window_size); gfs2_holder_disallow_demote(gh); if (leftover != window_size) { - if (!gfs2_holder_queued(gh)) - goto retry; - goto retry_under_glock; + if (gfs2_holder_queued(gh)) + goto retry_under_glock; + goto retry; } } out: @@ -992,12 +992,11 @@ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to) leftover = fault_in_iov_iter_writeable(to, window_size); gfs2_holder_disallow_demote(&gh); if (leftover != window_size) { - if (!gfs2_holder_queued(&gh)) { - if (written) - goto out_uninit; - goto retry; - } - goto retry_under_glock; + if (gfs2_holder_queued(&gh)) + goto retry_under_glock; + if (written) + goto out_uninit; + goto retry; } } if (gfs2_holder_queued(&gh)) @@ -1071,12 +1070,11 @@ static ssize_t gfs2_file_buffered_write(struct kiocb *iocb, gfs2_holder_disallow_demote(gh); if (leftover != window_size) { from->count = min(from->count, window_size - leftover); - if (!gfs2_holder_queued(gh)) { - if (read) - goto out_uninit; - goto retry; - } - goto retry_under_glock; + if (gfs2_holder_queued(gh)) + goto retry_under_glock; + if (read) + goto out_uninit; + goto retry; } } out_unlock: From 46f3e0421ccb5474b5c006b0089b9dfd42534bb6 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher <agruenba@redhat.com> Date: Thu, 17 Mar 2022 14:47:24 +0100 Subject: [PATCH 12/15] gfs2: Fix gfs2_file_buffered_write endless loop workaround Since commit 554c577cee95b, gfs2_file_buffered_write() can accidentally return a truncated iov_iter, which might confuse callers. Fix that. Fixes: 554c577cee95b ("gfs2: Prevent endless loops in gfs2_file_buffered_write") Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com> --- fs/gfs2/file.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 44bb886eefce5..19a038bc33bc2 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -1084,6 +1084,7 @@ static ssize_t gfs2_file_buffered_write(struct kiocb *iocb, gfs2_holder_uninit(gh); if (statfs_gh) kfree(statfs_gh); + from->count = orig_count - read; return read ? read : ret; } From 11661835f90153bdfc5325e550d2b72d0f47cb3e Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher <agruenba@redhat.com> Date: Thu, 24 Mar 2022 22:40:20 +0100 Subject: [PATCH 13/15] gfs2: Remove dead code in gfs2_file_read_iter Function iomap_dio_rw() only returns -ENOTBLK for write requests and gfs2_file_direct_read() no longer returns -ENOTBLK since commit 1d45bb7f9d2a5 ("gfs2: Use iomap for stuffed direct I/O reads"), so there is no need to check for -ENOTBLK in gfs2_file_read_iter() anymore. Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com> --- fs/gfs2/file.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 19a038bc33bc2..edc588465a4b6 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -951,12 +951,9 @@ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to) * and retry. */ - if (iocb->ki_flags & IOCB_DIRECT) { - ret = gfs2_file_direct_read(iocb, to, &gh); - if (likely(ret != -ENOTBLK)) - return ret; - iocb->ki_flags &= ~IOCB_DIRECT; - } + if (iocb->ki_flags & IOCB_DIRECT) + return gfs2_file_direct_read(iocb, to, &gh); + pagefault_disable(); iocb->ki_flags |= IOCB_NOIO; ret = generic_file_read_iter(iocb, to); From 3bde4c48586074202044456285a97ccdf9048988 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher <agruenba@redhat.com> Date: Thu, 24 Mar 2022 23:13:26 +0100 Subject: [PATCH 14/15] gfs2: Make sure not to return short direct writes When direct writes fail with -ENOTBLK because we're writing into a hole (gfs2_iomap_begin()) or because of a page invalidation failure (iomap_dio_rw()), we're falling back to buffered writes. In that case, when we lose the inode glock in gfs2_file_buffered_write(), we want to re-acquire it instead of returning a short write. Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com> --- fs/gfs2/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index edc588465a4b6..22b41acfbbc39 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -1069,7 +1069,7 @@ static ssize_t gfs2_file_buffered_write(struct kiocb *iocb, from->count = min(from->count, window_size - leftover); if (gfs2_holder_queued(gh)) goto retry_under_glock; - if (read) + if (read && !(iocb->ki_flags & IOCB_DIRECT)) goto out_uninit; goto retry; } From 27ca8273fda398638ca994a207323a85b6d81190 Mon Sep 17 00:00:00 2001 From: Andrew Price <anprice@redhat.com> Date: Tue, 22 Mar 2022 19:05:51 +0000 Subject: [PATCH 15/15] gfs2: Make sure FITRIM minlen is rounded up to fs block size Per fstrim(8) we must round up the minlen argument to the fs block size. The current calculation doesn't take into account devices that have a discard granularity and requested minlen less than 1 fs block, so the value can get shifted away to zero in the translation to fs blocks. The zero minlen passed to gfs2_rgrp_send_discards() then allows sb_issue_discard() to be called with nr_sects == 0 which returns -EINVAL and results in gfs2_rgrp_send_discards() returning -EIO. Make sure minlen is never < 1 fs block by taking the max of the requested minlen and the fs block size before comparing to the device's discard granularity and shifting to fs blocks. Fixes: 076f0faa764ab ("GFS2: Fix FITRIM argument handling") Signed-off-by: Andrew Price <anprice@redhat.com> Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com> --- fs/gfs2/rgrp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 9df8a84f85a64..801ad9f4f2bef 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1417,7 +1417,8 @@ int gfs2_fitrim(struct file *filp, void __user *argp) start = r.start >> bs_shift; end = start + (r.len >> bs_shift); - minlen = max_t(u64, r.minlen, + minlen = max_t(u64, r.minlen, sdp->sd_sb.sb_bsize); + minlen = max_t(u64, minlen, q->limits.discard_granularity) >> bs_shift; if (end <= start || minlen > sdp->sd_max_rg_data)