From c2f1c4bd20621175c581f298b4943df0cffbd841 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 13 Oct 2021 16:44:20 -0400 Subject: [PATCH 01/51] NFSD: Fix sparse warning /home/cel/src/linux/linux/fs/nfsd/nfs4proc.c:1539:24: warning: incorrect type in assignment (different base types) /home/cel/src/linux/linux/fs/nfsd/nfs4proc.c:1539:24: expected restricted __be32 [usertype] status /home/cel/src/linux/linux/fs/nfsd/nfs4proc.c:1539:24: got int Signed-off-by: Chuck Lever --- fs/nfsd/nfs4proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index a36261f89bdfa..a6dc5e18c498c 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1514,7 +1514,7 @@ static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy) u64 bytes_total = copy->cp_count; u64 src_pos = copy->cp_src_pos; u64 dst_pos = copy->cp_dst_pos; - __be32 status; + int status; /* See RFC 7862 p.67: */ if (bytes_total == 0) From 89b24336f03a8ba560e96b0c47a8434a7fa48e3c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 29 Nov 2021 15:51:25 +1100 Subject: [PATCH 02/51] NFSD: handle errors better in write_ports_addfd() If write_ports_add() fails, we shouldn't destroy the serv, unless we had only just created it. So if there are any permanent sockets already attached, leave the serv in place. Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- fs/nfsd/nfsctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 51a49e0cfe376..bf4c9996ad926 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -742,7 +742,7 @@ static ssize_t __write_ports_addfd(char *buf, struct net *net, const struct cred return err; err = svc_addsock(nn->nfsd_serv, fd, buf, SIMPLE_TRANSACTION_LIMIT, cred); - if (err < 0) { + if (err < 0 && list_empty(&nn->nfsd_serv->sv_permsocks)) { nfsd_destroy(net); return err; } From df5e49c880ea0776806b8a9f8ab95e035272cf6f Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 29 Nov 2021 15:51:25 +1100 Subject: [PATCH 03/51] SUNRPC: change svc_get() to return the svc. It is common for 'get' functions to return the object that was 'got', and there are a couple of places where users of svc_get() would be a little simpler if svc_get() did that. Make it so. Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- fs/lockd/svc.c | 6 ++---- fs/nfs/callback.c | 6 ++---- include/linux/sunrpc/svc.h | 3 ++- 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index b220e1b917268..2f50d5b2a8a42 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -430,14 +430,12 @@ static struct svc_serv *lockd_create_svc(void) /* * Check whether we're already up and running. */ - if (nlmsvc_rqst) { + if (nlmsvc_rqst) /* * Note: increase service usage, because later in case of error * svc_destroy() will be called. */ - svc_get(nlmsvc_rqst->rq_server); - return nlmsvc_rqst->rq_server; - } + return svc_get(nlmsvc_rqst->rq_server); /* * Sanity check: if there's no pid, diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 86d856de1389b..6e5e742a42b8b 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -266,14 +266,12 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion) /* * Check whether we're already up and running. */ - if (cb_info->serv) { + if (cb_info->serv) /* * Note: increase service usage, because later in case of error * svc_destroy() will be called. */ - svc_get(cb_info->serv); - return cb_info->serv; - } + return svc_get(cb_info->serv); switch (minorversion) { case 0: diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 0ae28ae6caf20..5d9568953fcd8 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -120,9 +120,10 @@ struct svc_serv { * change the number of threads. Horrible, but there it is. * Should be called with the "service mutex" held. */ -static inline void svc_get(struct svc_serv *serv) +static inline struct svc_serv *svc_get(struct svc_serv *serv) { serv->sv_nrthreads++; + return serv; } /* From 8c62d12740a1450d2e8456d5747f440e10db281a Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 29 Nov 2021 15:51:25 +1100 Subject: [PATCH 04/51] SUNRPC/NFSD: clean up get/put functions. svc_destroy() is poorly named - it doesn't necessarily destroy the svc, it might just reduce the ref count. nfsd_destroy() is poorly named for the same reason. This patch: - removes the refcount functionality from svc_destroy(), moving it to a new svc_put(). Almost all previous callers of svc_destroy() now call svc_put(). - renames nfsd_destroy() to nfsd_put() and improves the code, using the new svc_destroy() rather than svc_put() - removes a few comments that explain the important for balanced get/put calls. This should be obvious. The only non-trivial part of this is that svc_destroy() would call svc_sock_update() on a non-final decrement. It can no longer do that, and svc_put() isn't really a good place of it. This call is now made from svc_exit_thread() which seems like a good place. This makes the call *before* sv_nrthreads is decremented rather than after. This is not particularly important as the call just sets a flag which causes sv_nrthreads set be checked later. A subsequent patch will improve the ordering. Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- fs/lockd/svc.c | 6 +----- fs/nfs/callback.c | 14 ++------------ fs/nfsd/nfsctl.c | 4 ++-- fs/nfsd/nfsd.h | 2 +- fs/nfsd/nfssvc.c | 30 ++++++++++++++++-------------- include/linux/sunrpc/svc.h | 26 +++++++++++++++++++++++--- net/sunrpc/svc.c | 19 +++++-------------- 7 files changed, 50 insertions(+), 51 deletions(-) diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 2f50d5b2a8a42..135bd86ed3adb 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -431,10 +431,6 @@ static struct svc_serv *lockd_create_svc(void) * Check whether we're already up and running. */ if (nlmsvc_rqst) - /* - * Note: increase service usage, because later in case of error - * svc_destroy() will be called. - */ return svc_get(nlmsvc_rqst->rq_server); /* @@ -495,7 +491,7 @@ int lockd_up(struct net *net, const struct cred *cred) * so we exit through here on both success and failure. */ err_put: - svc_destroy(serv); + svc_put(serv); err_create: mutex_unlock(&nlmsvc_mutex); return error; diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 6e5e742a42b8b..edbc7579b4aae 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -267,10 +267,6 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion) * Check whether we're already up and running. */ if (cb_info->serv) - /* - * Note: increase service usage, because later in case of error - * svc_destroy() will be called. - */ return svc_get(cb_info->serv); switch (minorversion) { @@ -333,16 +329,10 @@ int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt) goto err_start; cb_info->users++; - /* - * svc_create creates the svc_serv with sv_nrthreads == 1, and then - * svc_prepare_thread increments that. So we need to call svc_destroy - * on both success and failure so that the refcount is 1 when the - * thread exits. - */ err_net: if (!cb_info->users) cb_info->serv = NULL; - svc_destroy(serv); + svc_put(serv); err_create: mutex_unlock(&nfs_callback_mutex); return ret; @@ -368,7 +358,7 @@ void nfs_callback_down(int minorversion, struct net *net) if (cb_info->users == 0) { svc_get(serv); serv->sv_ops->svo_setup(serv, NULL, 0); - svc_destroy(serv); + svc_put(serv); dprintk("nfs_callback_down: service destroyed\n"); cb_info->serv = NULL; } diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index bf4c9996ad926..17521fada83f6 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -743,7 +743,7 @@ static ssize_t __write_ports_addfd(char *buf, struct net *net, const struct cred err = svc_addsock(nn->nfsd_serv, fd, buf, SIMPLE_TRANSACTION_LIMIT, cred); if (err < 0 && list_empty(&nn->nfsd_serv->sv_permsocks)) { - nfsd_destroy(net); + nfsd_put(net); return err; } @@ -796,7 +796,7 @@ static ssize_t __write_ports_addxprt(char *buf, struct net *net, const struct cr if (!list_empty(&nn->nfsd_serv->sv_permsocks)) nn->nfsd_serv->sv_nrthreads--; else - nfsd_destroy(net); + nfsd_put(net); return err; } diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 498e5a4898260..3e5008b475ff0 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -97,7 +97,7 @@ int nfsd_pool_stats_open(struct inode *, struct file *); int nfsd_pool_stats_release(struct inode *, struct file *); void nfsd_shutdown_threads(struct net *net); -void nfsd_destroy(struct net *net); +void nfsd_put(struct net *net); bool i_am_nfsd(void); diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 80431921e5d79..a0a7564e6c73e 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -623,7 +623,7 @@ void nfsd_shutdown_threads(struct net *net) svc_get(serv); /* Kill outstanding nfsd threads */ serv->sv_ops->svo_setup(serv, NULL, 0); - nfsd_destroy(net); + nfsd_put(net); mutex_unlock(&nfsd_mutex); /* Wait for shutdown of nfsd_serv to complete */ wait_for_completion(&nn->nfsd_shutdown_complete); @@ -656,7 +656,10 @@ int nfsd_create_serv(struct net *net) nn->nfsd_serv->sv_maxconn = nn->max_connections; error = svc_bind(nn->nfsd_serv, net); if (error < 0) { - svc_destroy(nn->nfsd_serv); + /* NOT nfsd_put() as notifiers (see below) haven't + * been set up yet. + */ + svc_put(nn->nfsd_serv); nfsd_complete_shutdown(net); return error; } @@ -697,16 +700,16 @@ int nfsd_get_nrthreads(int n, int *nthreads, struct net *net) return 0; } -void nfsd_destroy(struct net *net) +void nfsd_put(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); - int destroy = (nn->nfsd_serv->sv_nrthreads == 1); - if (destroy) + nn->nfsd_serv->sv_nrthreads -= 1; + if (nn->nfsd_serv->sv_nrthreads == 0) { svc_shutdown_net(nn->nfsd_serv, net); - svc_destroy(nn->nfsd_serv); - if (destroy) + svc_destroy(nn->nfsd_serv); nfsd_complete_shutdown(net); + } } int nfsd_set_nrthreads(int n, int *nthreads, struct net *net) @@ -758,7 +761,7 @@ int nfsd_set_nrthreads(int n, int *nthreads, struct net *net) if (err) break; } - nfsd_destroy(net); + nfsd_put(net); return err; } @@ -795,7 +798,7 @@ nfsd_svc(int nrservs, struct net *net, const struct cred *cred) error = nfsd_startup_net(net, cred); if (error) - goto out_destroy; + goto out_put; error = nn->nfsd_serv->sv_ops->svo_setup(nn->nfsd_serv, NULL, nrservs); if (error) @@ -808,8 +811,8 @@ nfsd_svc(int nrservs, struct net *net, const struct cred *cred) out_shutdown: if (error < 0 && !nfsd_up_before) nfsd_shutdown_net(net); -out_destroy: - nfsd_destroy(net); /* Release server */ +out_put: + nfsd_put(net); out: mutex_unlock(&nfsd_mutex); return error; @@ -982,7 +985,7 @@ nfsd(void *vrqstp) /* Release the thread */ svc_exit_thread(rqstp); - nfsd_destroy(net); + nfsd_put(net); /* Release module */ mutex_unlock(&nfsd_mutex); @@ -1109,8 +1112,7 @@ int nfsd_pool_stats_release(struct inode *inode, struct file *file) struct net *net = inode->i_sb->s_fs_info; mutex_lock(&nfsd_mutex); - /* this function really, really should have been called svc_put() */ - nfsd_destroy(net); + nfsd_put(net); mutex_unlock(&nfsd_mutex); return ret; } diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 5d9568953fcd8..73d56d33a36d9 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -114,8 +114,13 @@ struct svc_serv { #endif /* CONFIG_SUNRPC_BACKCHANNEL */ }; -/* - * We use sv_nrthreads as a reference count. svc_destroy() drops +/** + * svc_get() - increment reference count on a SUNRPC serv + * @serv: the svc_serv to have count incremented + * + * Returns: the svc_serv that was passed in. + * + * We use sv_nrthreads as a reference count. svc_put() drops * this refcount, so we need to bump it up around operations that * change the number of threads. Horrible, but there it is. * Should be called with the "service mutex" held. @@ -126,6 +131,22 @@ static inline struct svc_serv *svc_get(struct svc_serv *serv) return serv; } +void svc_destroy(struct svc_serv *serv); + +/** + * svc_put - decrement reference count on a SUNRPC serv + * @serv: the svc_serv to have count decremented + * + * When the reference count reaches zero, svc_destroy() + * is called to clean up and free the serv. + */ +static inline void svc_put(struct svc_serv *serv) +{ + serv->sv_nrthreads -= 1; + if (serv->sv_nrthreads == 0) + svc_destroy(serv); +} + /* * Maximum payload size supported by a kernel RPC server. * This is use to determine the max number of pages nfsd is @@ -515,7 +536,6 @@ struct svc_serv * svc_create_pooled(struct svc_program *, unsigned int, int svc_set_num_threads(struct svc_serv *, struct svc_pool *, int); int svc_set_num_threads_sync(struct svc_serv *, struct svc_pool *, int); int svc_pool_stats_open(struct svc_serv *serv, struct file *file); -void svc_destroy(struct svc_serv *); void svc_shutdown_net(struct svc_serv *, struct net *); int svc_process(struct svc_rqst *); int bc_svc_process(struct svc_serv *, struct rpc_rqst *, diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 4292278a95526..55a1bf0d129f1 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -528,17 +528,7 @@ EXPORT_SYMBOL_GPL(svc_shutdown_net); void svc_destroy(struct svc_serv *serv) { - dprintk("svc: svc_destroy(%s, %d)\n", - serv->sv_program->pg_name, - serv->sv_nrthreads); - - if (serv->sv_nrthreads) { - if (--(serv->sv_nrthreads) != 0) { - svc_sock_update_bufs(serv); - return; - } - } else - printk("svc_destroy: no threads for serv=%p!\n", serv); + dprintk("svc: svc_destroy(%s)\n", serv->sv_program->pg_name); del_timer_sync(&serv->sv_temptimer); @@ -892,9 +882,10 @@ svc_exit_thread(struct svc_rqst *rqstp) svc_rqst_free(rqstp); - /* Release the server */ - if (serv) - svc_destroy(serv); + if (!serv) + return; + svc_sock_update_bufs(serv); + svc_destroy(serv); } EXPORT_SYMBOL_GPL(svc_exit_thread); From ec52361df99b490f6af412b046df9799b92c1050 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 29 Nov 2021 15:51:25 +1100 Subject: [PATCH 05/51] SUNRPC: stop using ->sv_nrthreads as a refcount The use of sv_nrthreads as a general refcount results in clumsy code, as is seen by various comments needed to explain the situation. This patch introduces a 'struct kref' and uses that for reference counting, leaving sv_nrthreads to be a pure count of threads. The kref is managed particularly in svc_get() and svc_put(), and also nfsd_put(); svc_destroy() now takes a pointer to the embedded kref, rather than to the serv. nfsd allows the svc_serv to exist with ->sv_nrhtreads being zero. This happens when a transport is created before the first thread is started. To support this, a 'keep_active' flag is introduced which holds a ref on the svc_serv. This is set when any listening socket is successfully added (unless there are running threads), and cleared when the number of threads is set. So when the last thread exits, the nfs_serv will be destroyed. The use of 'keep_active' replaces previous code which checked if there were any permanent sockets. We no longer clear ->rq_server when nfsd() exits. This was done to prevent svc_exit_thread() from calling svc_destroy(). Instead we take an extra reference to the svc_serv to prevent svc_destroy() from being called. Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- fs/lockd/svc.c | 4 ---- fs/nfs/callback.c | 2 +- fs/nfsd/netns.h | 7 +++++++ fs/nfsd/nfsctl.c | 22 +++++++++----------- fs/nfsd/nfssvc.c | 42 +++++++++++++++++++++++--------------- include/linux/sunrpc/svc.h | 14 ++++--------- net/sunrpc/svc.c | 22 ++++++++++---------- 7 files changed, 59 insertions(+), 54 deletions(-) diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 135bd86ed3adb..a9669b106dbde 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -486,10 +486,6 @@ int lockd_up(struct net *net, const struct cred *cred) goto err_put; } nlmsvc_users++; - /* - * Note: svc_serv structures have an initial use count of 1, - * so we exit through here on both success and failure. - */ err_put: svc_put(serv); err_create: diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index edbc7579b4aae..d9d78ffd1d653 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -169,7 +169,7 @@ static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt, if (nrservs < NFS4_MIN_NR_CALLBACK_THREADS) nrservs = NFS4_MIN_NR_CALLBACK_THREADS; - if (serv->sv_nrthreads-1 == nrservs) + if (serv->sv_nrthreads == nrservs) return 0; ret = serv->sv_ops->svo_setup(serv, NULL, nrservs); diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 935c1028c2175..08bcd8f23b013 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -123,6 +123,13 @@ struct nfsd_net { u32 clverifier_counter; struct svc_serv *nfsd_serv; + /* When a listening socket is added to nfsd, keep_active is set + * and this justifies a reference on nfsd_serv. This stops + * nfsd_serv from being freed. When the number of threads is + * set, keep_active is cleared and the reference is dropped. So + * when the last thread exits, the service will be destroyed. + */ + int keep_active; wait_queue_head_t ntf_wq; atomic_t ntf_refcnt; diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 17521fada83f6..7b557eb8211a0 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -742,13 +742,12 @@ static ssize_t __write_ports_addfd(char *buf, struct net *net, const struct cred return err; err = svc_addsock(nn->nfsd_serv, fd, buf, SIMPLE_TRANSACTION_LIMIT, cred); - if (err < 0 && list_empty(&nn->nfsd_serv->sv_permsocks)) { - nfsd_put(net); - return err; - } - /* Decrease the count, but don't shut down the service */ - nn->nfsd_serv->sv_nrthreads--; + if (err >= 0 && + !nn->nfsd_serv->sv_nrthreads && !xchg(&nn->keep_active, 1)) + svc_get(nn->nfsd_serv); + + nfsd_put(net); return err; } @@ -783,8 +782,10 @@ static ssize_t __write_ports_addxprt(char *buf, struct net *net, const struct cr if (err < 0 && err != -EAFNOSUPPORT) goto out_close; - /* Decrease the count, but don't shut down the service */ - nn->nfsd_serv->sv_nrthreads--; + if (!nn->nfsd_serv->sv_nrthreads && !xchg(&nn->keep_active, 1)) + svc_get(nn->nfsd_serv); + + nfsd_put(net); return 0; out_close: xprt = svc_find_xprt(nn->nfsd_serv, transport, net, PF_INET, port); @@ -793,10 +794,7 @@ static ssize_t __write_ports_addxprt(char *buf, struct net *net, const struct cr svc_xprt_put(xprt); } out_err: - if (!list_empty(&nn->nfsd_serv->sv_permsocks)) - nn->nfsd_serv->sv_nrthreads--; - else - nfsd_put(net); + nfsd_put(net); return err; } diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index a0a7564e6c73e..5f605e7e80915 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -60,13 +60,13 @@ static __be32 nfsd_init_request(struct svc_rqst *, * extent ->sv_temp_socks and ->sv_permsocks. It also protects nfsdstats.th_cnt * * If (out side the lock) nn->nfsd_serv is non-NULL, then it must point to a - * properly initialised 'struct svc_serv' with ->sv_nrthreads > 0. That number - * of nfsd threads must exist and each must listed in ->sp_all_threads in each - * entry of ->sv_pools[]. + * properly initialised 'struct svc_serv' with ->sv_nrthreads > 0 (unless + * nn->keep_active is set). That number of nfsd threads must + * exist and each must be listed in ->sp_all_threads in some entry of + * ->sv_pools[]. * - * Transitions of the thread count between zero and non-zero are of particular - * interest since the svc_serv needs to be created and initialized at that - * point, or freed. + * Each active thread holds a counted reference on nn->nfsd_serv, as does + * the nn->keep_active flag and various transient calls to svc_get(). * * Finally, the nfsd_mutex also protects some of the global variables that are * accessed when nfsd starts and that are settable via the write_* routines in @@ -700,14 +700,22 @@ int nfsd_get_nrthreads(int n, int *nthreads, struct net *net) return 0; } +/* This is the callback for kref_put() below. + * There is no code here as the first thing to be done is + * call svc_shutdown_net(), but we cannot get the 'net' from + * the kref. So do all the work when kref_put returns true. + */ +static void nfsd_noop(struct kref *ref) +{ +} + void nfsd_put(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); - nn->nfsd_serv->sv_nrthreads -= 1; - if (nn->nfsd_serv->sv_nrthreads == 0) { + if (kref_put(&nn->nfsd_serv->sv_refcnt, nfsd_noop)) { svc_shutdown_net(nn->nfsd_serv, net); - svc_destroy(nn->nfsd_serv); + svc_destroy(&nn->nfsd_serv->sv_refcnt); nfsd_complete_shutdown(net); } } @@ -803,15 +811,14 @@ nfsd_svc(int nrservs, struct net *net, const struct cred *cred) NULL, nrservs); if (error) goto out_shutdown; - /* We are holding a reference to nn->nfsd_serv which - * we don't want to count in the return value, - * so subtract 1 - */ - error = nn->nfsd_serv->sv_nrthreads - 1; + error = nn->nfsd_serv->sv_nrthreads; out_shutdown: if (error < 0 && !nfsd_up_before) nfsd_shutdown_net(net); out_put: + /* Threads now hold service active */ + if (xchg(&nn->keep_active, 0)) + nfsd_put(net); nfsd_put(net); out: mutex_unlock(&nfsd_mutex); @@ -980,11 +987,15 @@ nfsd(void *vrqstp) nfsdstats.th_cnt --; out: - rqstp->rq_server = NULL; + /* Take an extra ref so that the svc_put in svc_exit_thread() + * doesn't call svc_destroy() + */ + svc_get(nn->nfsd_serv); /* Release the thread */ svc_exit_thread(rqstp); + /* Now if needed we call svc_destroy in appropriate context */ nfsd_put(net); /* Release module */ @@ -1099,7 +1110,6 @@ int nfsd_pool_stats_open(struct inode *inode, struct file *file) mutex_unlock(&nfsd_mutex); return -ENODEV; } - /* bump up the psudo refcount while traversing */ svc_get(nn->nfsd_serv); ret = svc_pool_stats_open(nn->nfsd_serv, file); mutex_unlock(&nfsd_mutex); diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 73d56d33a36d9..3903b4ae8ac53 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -85,6 +85,7 @@ struct svc_serv { struct svc_program * sv_program; /* RPC program */ struct svc_stat * sv_stats; /* RPC statistics */ spinlock_t sv_lock; + struct kref sv_refcnt; unsigned int sv_nrthreads; /* # of server threads */ unsigned int sv_maxconn; /* max connections allowed or * '0' causing max to be based @@ -119,19 +120,14 @@ struct svc_serv { * @serv: the svc_serv to have count incremented * * Returns: the svc_serv that was passed in. - * - * We use sv_nrthreads as a reference count. svc_put() drops - * this refcount, so we need to bump it up around operations that - * change the number of threads. Horrible, but there it is. - * Should be called with the "service mutex" held. */ static inline struct svc_serv *svc_get(struct svc_serv *serv) { - serv->sv_nrthreads++; + kref_get(&serv->sv_refcnt); return serv; } -void svc_destroy(struct svc_serv *serv); +void svc_destroy(struct kref *); /** * svc_put - decrement reference count on a SUNRPC serv @@ -142,9 +138,7 @@ void svc_destroy(struct svc_serv *serv); */ static inline void svc_put(struct svc_serv *serv) { - serv->sv_nrthreads -= 1; - if (serv->sv_nrthreads == 0) - svc_destroy(serv); + kref_put(&serv->sv_refcnt, svc_destroy); } /* diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 55a1bf0d129f1..acddc6e12e9e1 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -435,7 +435,7 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, return NULL; serv->sv_name = prog->pg_name; serv->sv_program = prog; - serv->sv_nrthreads = 1; + kref_init(&serv->sv_refcnt); serv->sv_stats = prog->pg_stats; if (bufsize > RPCSVC_MAXPAYLOAD) bufsize = RPCSVC_MAXPAYLOAD; @@ -526,10 +526,11 @@ EXPORT_SYMBOL_GPL(svc_shutdown_net); * protect the sv_nrthreads, sv_permsocks and sv_tempsocks. */ void -svc_destroy(struct svc_serv *serv) +svc_destroy(struct kref *ref) { - dprintk("svc: svc_destroy(%s)\n", serv->sv_program->pg_name); + struct svc_serv *serv = container_of(ref, struct svc_serv, sv_refcnt); + dprintk("svc: svc_destroy(%s)\n", serv->sv_program->pg_name); del_timer_sync(&serv->sv_temptimer); /* @@ -637,6 +638,7 @@ svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node) if (!rqstp) return ERR_PTR(-ENOMEM); + svc_get(serv); serv->sv_nrthreads++; spin_lock_bh(&pool->sp_lock); pool->sp_nrthreads++; @@ -776,8 +778,7 @@ int svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) { if (pool == NULL) { - /* The -1 assumes caller has done a svc_get() */ - nrservs -= (serv->sv_nrthreads-1); + nrservs -= serv->sv_nrthreads; } else { spin_lock_bh(&pool->sp_lock); nrservs -= pool->sp_nrthreads; @@ -814,8 +815,7 @@ int svc_set_num_threads_sync(struct svc_serv *serv, struct svc_pool *pool, int nrservs) { if (pool == NULL) { - /* The -1 assumes caller has done a svc_get() */ - nrservs -= (serv->sv_nrthreads-1); + nrservs -= serv->sv_nrthreads; } else { spin_lock_bh(&pool->sp_lock); nrservs -= pool->sp_nrthreads; @@ -880,12 +880,12 @@ svc_exit_thread(struct svc_rqst *rqstp) list_del_rcu(&rqstp->rq_all); spin_unlock_bh(&pool->sp_lock); + serv->sv_nrthreads -= 1; + svc_sock_update_bufs(serv); + svc_rqst_free(rqstp); - if (!serv) - return; - svc_sock_update_bufs(serv); - svc_destroy(serv); + svc_put(serv); } EXPORT_SYMBOL_GPL(svc_exit_thread); From 9b6c8c9bebccd5fb785c306b948c08874a88874d Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 29 Nov 2021 15:51:25 +1100 Subject: [PATCH 06/51] nfsd: make nfsd_stats.th_cnt atomic_t This allows us to move the updates for th_cnt out of the mutex. This is a step towards reducing mutex coverage in nfsd(). Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- fs/nfsd/nfssvc.c | 6 +++--- fs/nfsd/stats.c | 2 +- fs/nfsd/stats.h | 4 +--- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 5f605e7e80915..fc5899502a837 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -57,7 +57,7 @@ static __be32 nfsd_init_request(struct svc_rqst *, /* * nfsd_mutex protects nn->nfsd_serv -- both the pointer itself and the members * of the svc_serv struct. In particular, ->sv_nrthreads but also to some - * extent ->sv_temp_socks and ->sv_permsocks. It also protects nfsdstats.th_cnt + * extent ->sv_temp_socks and ->sv_permsocks. * * If (out side the lock) nn->nfsd_serv is non-NULL, then it must point to a * properly initialised 'struct svc_serv' with ->sv_nrthreads > 0 (unless @@ -955,8 +955,8 @@ nfsd(void *vrqstp) allow_signal(SIGINT); allow_signal(SIGQUIT); - nfsdstats.th_cnt++; mutex_unlock(&nfsd_mutex); + atomic_inc(&nfsdstats.th_cnt); set_freezable(); @@ -983,8 +983,8 @@ nfsd(void *vrqstp) /* Clear signals before calling svc_exit_thread() */ flush_signals(current); + atomic_dec(&nfsdstats.th_cnt); mutex_lock(&nfsd_mutex); - nfsdstats.th_cnt --; out: /* Take an extra ref so that the svc_put in svc_exit_thread() diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c index 1d3b881e73821..a8c5a02a84f04 100644 --- a/fs/nfsd/stats.c +++ b/fs/nfsd/stats.c @@ -45,7 +45,7 @@ static int nfsd_proc_show(struct seq_file *seq, void *v) percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_IO_WRITE])); /* thread usage: */ - seq_printf(seq, "th %u 0", nfsdstats.th_cnt); + seq_printf(seq, "th %u 0", atomic_read(&nfsdstats.th_cnt)); /* deprecated thread usage histogram stats */ for (i = 0; i < 10; i++) diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h index 51ecda852e23b..9b43dc3d99913 100644 --- a/fs/nfsd/stats.h +++ b/fs/nfsd/stats.h @@ -29,11 +29,9 @@ enum { struct nfsd_stats { struct percpu_counter counter[NFSD_STATS_COUNTERS_NUM]; - /* Protected by nfsd_mutex */ - unsigned int th_cnt; /* number of available threads */ + atomic_t th_cnt; /* number of available threads */ }; - extern struct nfsd_stats nfsdstats; extern struct svc_stat nfsd_svcstats; From 2a36395fac3b72771f87c3ee4387e3a96d85a7cc Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 29 Nov 2021 15:51:25 +1100 Subject: [PATCH 07/51] SUNRPC: use sv_lock to protect updates to sv_nrthreads. Using sv_lock means we don't need to hold the service mutex over these updates. In particular, svc_exit_thread() no longer requires synchronisation, so threads can exit asynchronously. Note that we could use an atomic_t, but as there are many more read sites than writes, that would add unnecessary noise to the code. Some reads are already racy, and there is no need for them to not be. Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- fs/nfsd/nfssvc.c | 5 ++--- net/sunrpc/svc.c | 9 +++++++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index fc5899502a837..e9c9fa820b170 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -55,9 +55,8 @@ static __be32 nfsd_init_request(struct svc_rqst *, struct svc_process_info *); /* - * nfsd_mutex protects nn->nfsd_serv -- both the pointer itself and the members - * of the svc_serv struct. In particular, ->sv_nrthreads but also to some - * extent ->sv_temp_socks and ->sv_permsocks. + * nfsd_mutex protects nn->nfsd_serv -- both the pointer itself and some members + * of the svc_serv struct such as ->sv_temp_socks and ->sv_permsocks. * * If (out side the lock) nn->nfsd_serv is non-NULL, then it must point to a * properly initialised 'struct svc_serv' with ->sv_nrthreads > 0 (unless diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index acddc6e12e9e1..2b2042234e4bb 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -523,7 +523,7 @@ EXPORT_SYMBOL_GPL(svc_shutdown_net); /* * Destroy an RPC service. Should be called with appropriate locking to - * protect the sv_nrthreads, sv_permsocks and sv_tempsocks. + * protect sv_permsocks and sv_tempsocks. */ void svc_destroy(struct kref *ref) @@ -639,7 +639,10 @@ svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node) return ERR_PTR(-ENOMEM); svc_get(serv); - serv->sv_nrthreads++; + spin_lock_bh(&serv->sv_lock); + serv->sv_nrthreads += 1; + spin_unlock_bh(&serv->sv_lock); + spin_lock_bh(&pool->sp_lock); pool->sp_nrthreads++; list_add_rcu(&rqstp->rq_all, &pool->sp_all_threads); @@ -880,7 +883,9 @@ svc_exit_thread(struct svc_rqst *rqstp) list_del_rcu(&rqstp->rq_all); spin_unlock_bh(&pool->sp_lock); + spin_lock_bh(&serv->sv_lock); serv->sv_nrthreads -= 1; + spin_unlock_bh(&serv->sv_lock); svc_sock_update_bufs(serv); svc_rqst_free(rqstp); From 9d3792aefdcda71d20c2b1ecc589c17ae71eb523 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 29 Nov 2021 15:51:25 +1100 Subject: [PATCH 08/51] NFSD: narrow nfsd_mutex protection in nfsd thread There is nothing happening in the start of nfsd() that requires protection by the mutex, so don't take it until shutting down the thread - which does still require protection - but only for nfsd_put(). Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- fs/nfsd/nfssvc.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index e9c9fa820b170..097abd8b059c5 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -932,9 +932,6 @@ nfsd(void *vrqstp) struct nfsd_net *nn = net_generic(net, nfsd_net_id); int err; - /* Lock module and set up kernel thread */ - mutex_lock(&nfsd_mutex); - /* At this point, the thread shares current->fs * with the init process. We need to create files with the * umask as defined by the client instead of init's umask. */ @@ -954,7 +951,6 @@ nfsd(void *vrqstp) allow_signal(SIGINT); allow_signal(SIGQUIT); - mutex_unlock(&nfsd_mutex); atomic_inc(&nfsdstats.th_cnt); set_freezable(); @@ -983,7 +979,6 @@ nfsd(void *vrqstp) flush_signals(current); atomic_dec(&nfsdstats.th_cnt); - mutex_lock(&nfsd_mutex); out: /* Take an extra ref so that the svc_put in svc_exit_thread() @@ -995,10 +990,11 @@ nfsd(void *vrqstp) svc_exit_thread(rqstp); /* Now if needed we call svc_destroy in appropriate context */ + mutex_lock(&nfsd_mutex); nfsd_put(net); + mutex_unlock(&nfsd_mutex); /* Release module */ - mutex_unlock(&nfsd_mutex); module_put_and_exit(0); return 0; } From 3409e4f1e8f239f0ed81be0b068ecf4e73e2e826 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 29 Nov 2021 15:51:25 +1100 Subject: [PATCH 09/51] NFSD: Make it possible to use svc_set_num_threads_sync nfsd cannot currently use svc_set_num_threads_sync. It instead uses svc_set_num_threads which does *not* wait for threads to all exit, and has a separate mechanism (nfsd_shutdown_complete) to wait for completion. The reason that nfsd is unlike other services is that nfsd threads can exit separately from svc_set_num_threads being called - they die on receipt of SIGKILL. Also, when the last thread exits, the service must be shut down (sockets closed). For this, the nfsd_mutex needs to be taken, and as that mutex needs to be held while svc_set_num_threads is called, the one cannot wait for the other. This patch changes the nfsd thread so that it can drop the ref on the service without blocking on nfsd_mutex, so that svc_set_num_threads_sync can be used: - if it can drop a non-last reference, it does that. This does not trigger shutdown and does not require a mutex. This will likely happen for all but the last thread signalled, and for all threads being shut down by nfsd_shutdown_threads() - if it can get the mutex without blocking (trylock), it does that and then drops the reference. This will likely happen for the last thread killed by SIGKILL - Otherwise there might be an unrelated task holding the mutex, possibly in another network namespace, or nfsd_shutdown_threads() might be just about to get a reference on the service, after which we can drop ours safely. We cannot conveniently get wakeup notifications on these events, and we are unlikely to need to, so we sleep briefly and check again. With this we can discard nfsd_shutdown_complete and nfsd_complete_shutdown(), and switch to svc_set_num_threads_sync. Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- fs/nfsd/netns.h | 3 --- fs/nfsd/nfssvc.c | 41 +++++++++++++++++++------------------- include/linux/sunrpc/svc.h | 13 ++++++++++++ 3 files changed, 33 insertions(+), 24 deletions(-) diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 08bcd8f23b013..1fd59eb0730bb 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -134,9 +134,6 @@ struct nfsd_net { wait_queue_head_t ntf_wq; atomic_t ntf_refcnt; - /* Allow umount to wait for nfsd state cleanup */ - struct completion nfsd_shutdown_complete; - /* * clientid and stateid data for construction of net unique COPY * stateids. diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 097abd8b059c5..d0d9107a1b93d 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -593,20 +593,10 @@ static const struct svc_serv_ops nfsd_thread_sv_ops = { .svo_shutdown = nfsd_last_thread, .svo_function = nfsd, .svo_enqueue_xprt = svc_xprt_do_enqueue, - .svo_setup = svc_set_num_threads, + .svo_setup = svc_set_num_threads_sync, .svo_module = THIS_MODULE, }; -static void nfsd_complete_shutdown(struct net *net) -{ - struct nfsd_net *nn = net_generic(net, nfsd_net_id); - - WARN_ON(!mutex_is_locked(&nfsd_mutex)); - - nn->nfsd_serv = NULL; - complete(&nn->nfsd_shutdown_complete); -} - void nfsd_shutdown_threads(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); @@ -624,8 +614,6 @@ void nfsd_shutdown_threads(struct net *net) serv->sv_ops->svo_setup(serv, NULL, 0); nfsd_put(net); mutex_unlock(&nfsd_mutex); - /* Wait for shutdown of nfsd_serv to complete */ - wait_for_completion(&nn->nfsd_shutdown_complete); } bool i_am_nfsd(void) @@ -650,7 +638,6 @@ int nfsd_create_serv(struct net *net) &nfsd_thread_sv_ops); if (nn->nfsd_serv == NULL) return -ENOMEM; - init_completion(&nn->nfsd_shutdown_complete); nn->nfsd_serv->sv_maxconn = nn->max_connections; error = svc_bind(nn->nfsd_serv, net); @@ -659,7 +646,7 @@ int nfsd_create_serv(struct net *net) * been set up yet. */ svc_put(nn->nfsd_serv); - nfsd_complete_shutdown(net); + nn->nfsd_serv = NULL; return error; } @@ -715,7 +702,7 @@ void nfsd_put(struct net *net) if (kref_put(&nn->nfsd_serv->sv_refcnt, nfsd_noop)) { svc_shutdown_net(nn->nfsd_serv, net); svc_destroy(&nn->nfsd_serv->sv_refcnt); - nfsd_complete_shutdown(net); + nn->nfsd_serv = NULL; } } @@ -743,7 +730,7 @@ int nfsd_set_nrthreads(int n, int *nthreads, struct net *net) if (tot > NFSD_MAXSERVS) { /* total too large: scale down requested numbers */ for (i = 0; i < n && tot > 0; i++) { - int new = nthreads[i] * NFSD_MAXSERVS / tot; + int new = nthreads[i] * NFSD_MAXSERVS / tot; tot -= (nthreads[i] - new); nthreads[i] = new; } @@ -989,10 +976,22 @@ nfsd(void *vrqstp) /* Release the thread */ svc_exit_thread(rqstp); - /* Now if needed we call svc_destroy in appropriate context */ - mutex_lock(&nfsd_mutex); - nfsd_put(net); - mutex_unlock(&nfsd_mutex); + /* We need to drop a ref, but may not drop the last reference + * without holding nfsd_mutex, and we cannot wait for nfsd_mutex as that + * could deadlock with nfsd_shutdown_threads() waiting for us. + * So three options are: + * - drop a non-final reference, + * - get the mutex without waiting + * - sleep briefly andd try the above again + */ + while (!svc_put_not_last(nn->nfsd_serv)) { + if (mutex_trylock(&nfsd_mutex)) { + nfsd_put(net); + mutex_unlock(&nfsd_mutex); + break; + } + msleep(20); + } /* Release module */ module_put_and_exit(0); diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 3903b4ae8ac53..36bfc0281988b 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -141,6 +141,19 @@ static inline void svc_put(struct svc_serv *serv) kref_put(&serv->sv_refcnt, svc_destroy); } +/** + * svc_put_not_last - decrement non-final reference count on SUNRPC serv + * @serv: the svc_serv to have count decremented + * + * Returns: %true is refcount was decremented. + * + * If the refcount is 1, it is not decremented and instead failure is reported. + */ +static inline bool svc_put_not_last(struct svc_serv *serv) +{ + return refcount_dec_not_one(&serv->sv_refcnt.refcount); +} + /* * Maximum payload size supported by a kernel RPC server. * This is use to determine the max number of pages nfsd is From 3ebdbe5203a874614819700d3f470724cb803709 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 29 Nov 2021 15:51:25 +1100 Subject: [PATCH 10/51] SUNRPC: discard svo_setup and rename svc_set_num_threads_sync() The ->svo_setup callback serves no purpose. It is always called from within the same module that chooses which callback is needed. So discard it and call the relevant function directly. Now that svc_set_num_threads() is no longer used remove it and rename svc_set_num_threads_sync() to remove the "_sync" suffix. Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- fs/nfs/callback.c | 8 +++---- fs/nfsd/nfssvc.c | 11 ++++----- include/linux/sunrpc/svc.h | 4 ---- net/sunrpc/svc.c | 49 ++------------------------------------ 4 files changed, 10 insertions(+), 62 deletions(-) diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index d9d78ffd1d653..6cdc9d18a7dd3 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -172,9 +172,9 @@ static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt, if (serv->sv_nrthreads == nrservs) return 0; - ret = serv->sv_ops->svo_setup(serv, NULL, nrservs); + ret = svc_set_num_threads(serv, NULL, nrservs); if (ret) { - serv->sv_ops->svo_setup(serv, NULL, 0); + svc_set_num_threads(serv, NULL, 0); return ret; } dprintk("nfs_callback_up: service started\n"); @@ -235,14 +235,12 @@ static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, static const struct svc_serv_ops nfs40_cb_sv_ops = { .svo_function = nfs4_callback_svc, .svo_enqueue_xprt = svc_xprt_do_enqueue, - .svo_setup = svc_set_num_threads_sync, .svo_module = THIS_MODULE, }; #if defined(CONFIG_NFS_V4_1) static const struct svc_serv_ops nfs41_cb_sv_ops = { .svo_function = nfs41_callback_svc, .svo_enqueue_xprt = svc_xprt_do_enqueue, - .svo_setup = svc_set_num_threads_sync, .svo_module = THIS_MODULE, }; @@ -357,7 +355,7 @@ void nfs_callback_down(int minorversion, struct net *net) cb_info->users--; if (cb_info->users == 0) { svc_get(serv); - serv->sv_ops->svo_setup(serv, NULL, 0); + svc_set_num_threads(serv, NULL, 0); svc_put(serv); dprintk("nfs_callback_down: service destroyed\n"); cb_info->serv = NULL; diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index d0d9107a1b93d..020156e96bdb5 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -593,7 +593,6 @@ static const struct svc_serv_ops nfsd_thread_sv_ops = { .svo_shutdown = nfsd_last_thread, .svo_function = nfsd, .svo_enqueue_xprt = svc_xprt_do_enqueue, - .svo_setup = svc_set_num_threads_sync, .svo_module = THIS_MODULE, }; @@ -611,7 +610,7 @@ void nfsd_shutdown_threads(struct net *net) svc_get(serv); /* Kill outstanding nfsd threads */ - serv->sv_ops->svo_setup(serv, NULL, 0); + svc_set_num_threads(serv, NULL, 0); nfsd_put(net); mutex_unlock(&nfsd_mutex); } @@ -750,8 +749,9 @@ int nfsd_set_nrthreads(int n, int *nthreads, struct net *net) /* apply the new numbers */ svc_get(nn->nfsd_serv); for (i = 0; i < n; i++) { - err = nn->nfsd_serv->sv_ops->svo_setup(nn->nfsd_serv, - &nn->nfsd_serv->sv_pools[i], nthreads[i]); + err = svc_set_num_threads(nn->nfsd_serv, + &nn->nfsd_serv->sv_pools[i], + nthreads[i]); if (err) break; } @@ -793,8 +793,7 @@ nfsd_svc(int nrservs, struct net *net, const struct cred *cred) error = nfsd_startup_net(net, cred); if (error) goto out_put; - error = nn->nfsd_serv->sv_ops->svo_setup(nn->nfsd_serv, - NULL, nrservs); + error = svc_set_num_threads(nn->nfsd_serv, NULL, nrservs); if (error) goto out_shutdown; error = nn->nfsd_serv->sv_nrthreads; diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 36bfc0281988b..0b38c6eaf9852 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -64,9 +64,6 @@ struct svc_serv_ops { /* queue up a transport for servicing */ void (*svo_enqueue_xprt)(struct svc_xprt *); - /* set up thread (or whatever) execution context */ - int (*svo_setup)(struct svc_serv *, struct svc_pool *, int); - /* optional module to count when adding threads (pooled svcs only) */ struct module *svo_module; }; @@ -541,7 +538,6 @@ void svc_pool_map_put(void); struct svc_serv * svc_create_pooled(struct svc_program *, unsigned int, const struct svc_serv_ops *); int svc_set_num_threads(struct svc_serv *, struct svc_pool *, int); -int svc_set_num_threads_sync(struct svc_serv *, struct svc_pool *, int); int svc_pool_stats_open(struct svc_serv *serv, struct file *file); void svc_shutdown_net(struct svc_serv *, struct net *); int svc_process(struct svc_rqst *); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 2b2042234e4bb..5513f8c9a8d63 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -743,58 +743,13 @@ svc_start_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) return 0; } - -/* destroy old threads */ -static int -svc_signal_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) -{ - struct task_struct *task; - unsigned int state = serv->sv_nrthreads-1; - - /* destroy old threads */ - do { - task = choose_victim(serv, pool, &state); - if (task == NULL) - break; - send_sig(SIGINT, task, 1); - nrservs++; - } while (nrservs < 0); - - return 0; -} - /* * Create or destroy enough new threads to make the number * of threads the given number. If `pool' is non-NULL, applies * only to threads in that pool, otherwise round-robins between * all pools. Caller must ensure that mutual exclusion between this and * server startup or shutdown. - * - * Destroying threads relies on the service threads filling in - * rqstp->rq_task, which only the nfs ones do. Assumes the serv - * has been created using svc_create_pooled(). - * - * Based on code that used to be in nfsd_svc() but tweaked - * to be pool-aware. */ -int -svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) -{ - if (pool == NULL) { - nrservs -= serv->sv_nrthreads; - } else { - spin_lock_bh(&pool->sp_lock); - nrservs -= pool->sp_nrthreads; - spin_unlock_bh(&pool->sp_lock); - } - - if (nrservs > 0) - return svc_start_kthreads(serv, pool, nrservs); - if (nrservs < 0) - return svc_signal_kthreads(serv, pool, nrservs); - return 0; -} -EXPORT_SYMBOL_GPL(svc_set_num_threads); /* destroy old threads */ static int @@ -815,7 +770,7 @@ svc_stop_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) } int -svc_set_num_threads_sync(struct svc_serv *serv, struct svc_pool *pool, int nrservs) +svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) { if (pool == NULL) { nrservs -= serv->sv_nrthreads; @@ -831,7 +786,7 @@ svc_set_num_threads_sync(struct svc_serv *serv, struct svc_pool *pool, int nrser return svc_stop_kthreads(serv, pool, nrservs); return 0; } -EXPORT_SYMBOL_GPL(svc_set_num_threads_sync); +EXPORT_SYMBOL_GPL(svc_set_num_threads); /** * svc_rqst_replace_page - Replace one page in rq_pages[] From d057cfec4940ce6eeffa22b4a71dec203b06cd55 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 29 Nov 2021 15:51:25 +1100 Subject: [PATCH 11/51] NFSD: simplify locking for network notifier. nfsd currently maintains an open-coded read/write semaphore (refcount and wait queue) for each network namespace to ensure the nfs service isn't shut down while the notifier is running. This is excessive. As there is unlikely to be contention between notifiers and they run without sleeping, a single spinlock is sufficient to avoid problems. Signed-off-by: NeilBrown [ cel: ensure nfsd_notifier_lock is static ] Signed-off-by: Chuck Lever --- fs/nfsd/netns.h | 3 --- fs/nfsd/nfsctl.c | 2 -- fs/nfsd/nfssvc.c | 38 ++++++++++++++++++++------------------ 3 files changed, 20 insertions(+), 23 deletions(-) diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 1fd59eb0730bb..021acdc0d03bb 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -131,9 +131,6 @@ struct nfsd_net { */ int keep_active; - wait_queue_head_t ntf_wq; - atomic_t ntf_refcnt; - /* * clientid and stateid data for construction of net unique COPY * stateids. diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 7b557eb8211a0..a8ad71567fc72 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1483,8 +1483,6 @@ static __net_init int nfsd_init_net(struct net *net) nn->clientid_counter = nn->clientid_base + 1; nn->s2s_cp_cl_id = nn->clientid_counter++; - atomic_set(&nn->ntf_refcnt, 0); - init_waitqueue_head(&nn->ntf_wq); seqlock_init(&nn->boot_lock); return 0; diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 020156e96bdb5..14c1ef6f8cc74 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -434,6 +434,7 @@ static void nfsd_shutdown_net(struct net *net) nfsd_shutdown_generic(); } +static DEFINE_SPINLOCK(nfsd_notifier_lock); static int nfsd_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr) { @@ -443,18 +444,17 @@ static int nfsd_inetaddr_event(struct notifier_block *this, unsigned long event, struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct sockaddr_in sin; - if ((event != NETDEV_DOWN) || - !atomic_inc_not_zero(&nn->ntf_refcnt)) + if (event != NETDEV_DOWN || !nn->nfsd_serv) goto out; + spin_lock(&nfsd_notifier_lock); if (nn->nfsd_serv) { dprintk("nfsd_inetaddr_event: removed %pI4\n", &ifa->ifa_local); sin.sin_family = AF_INET; sin.sin_addr.s_addr = ifa->ifa_local; svc_age_temp_xprts_now(nn->nfsd_serv, (struct sockaddr *)&sin); } - atomic_dec(&nn->ntf_refcnt); - wake_up(&nn->ntf_wq); + spin_unlock(&nfsd_notifier_lock); out: return NOTIFY_DONE; @@ -474,10 +474,10 @@ static int nfsd_inet6addr_event(struct notifier_block *this, struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct sockaddr_in6 sin6; - if ((event != NETDEV_DOWN) || - !atomic_inc_not_zero(&nn->ntf_refcnt)) + if (event != NETDEV_DOWN || !nn->nfsd_serv) goto out; + spin_lock(&nfsd_notifier_lock); if (nn->nfsd_serv) { dprintk("nfsd_inet6addr_event: removed %pI6\n", &ifa->addr); sin6.sin6_family = AF_INET6; @@ -486,8 +486,8 @@ static int nfsd_inet6addr_event(struct notifier_block *this, sin6.sin6_scope_id = ifa->idev->dev->ifindex; svc_age_temp_xprts_now(nn->nfsd_serv, (struct sockaddr *)&sin6); } - atomic_dec(&nn->ntf_refcnt); - wake_up(&nn->ntf_wq); + spin_unlock(&nfsd_notifier_lock); + out: return NOTIFY_DONE; } @@ -504,7 +504,6 @@ static void nfsd_last_thread(struct svc_serv *serv, struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); - atomic_dec(&nn->ntf_refcnt); /* check if the notifier still has clients */ if (atomic_dec_return(&nfsd_notifier_refcount) == 0) { unregister_inetaddr_notifier(&nfsd_inetaddr_notifier); @@ -512,7 +511,6 @@ static void nfsd_last_thread(struct svc_serv *serv, struct net *net) unregister_inet6addr_notifier(&nfsd_inet6addr_notifier); #endif } - wait_event(nn->ntf_wq, atomic_read(&nn->ntf_refcnt) == 0); /* * write_ports can create the server without actually starting @@ -624,6 +622,7 @@ int nfsd_create_serv(struct net *net) { int error; struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct svc_serv *serv; WARN_ON(!mutex_is_locked(&nfsd_mutex)); if (nn->nfsd_serv) { @@ -633,21 +632,23 @@ int nfsd_create_serv(struct net *net) if (nfsd_max_blksize == 0) nfsd_max_blksize = nfsd_get_default_max_blksize(); nfsd_reset_versions(nn); - nn->nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, - &nfsd_thread_sv_ops); - if (nn->nfsd_serv == NULL) + serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, + &nfsd_thread_sv_ops); + if (serv == NULL) return -ENOMEM; - nn->nfsd_serv->sv_maxconn = nn->max_connections; - error = svc_bind(nn->nfsd_serv, net); + serv->sv_maxconn = nn->max_connections; + error = svc_bind(serv, net); if (error < 0) { /* NOT nfsd_put() as notifiers (see below) haven't * been set up yet. */ - svc_put(nn->nfsd_serv); - nn->nfsd_serv = NULL; + svc_put(serv); return error; } + spin_lock(&nfsd_notifier_lock); + nn->nfsd_serv = serv; + spin_unlock(&nfsd_notifier_lock); set_max_drc(); /* check if the notifier is already set */ @@ -657,7 +658,6 @@ int nfsd_create_serv(struct net *net) register_inet6addr_notifier(&nfsd_inet6addr_notifier); #endif } - atomic_inc(&nn->ntf_refcnt); nfsd_reset_boot_verifier(nn); return 0; } @@ -701,7 +701,9 @@ void nfsd_put(struct net *net) if (kref_put(&nn->nfsd_serv->sv_refcnt, nfsd_noop)) { svc_shutdown_net(nn->nfsd_serv, net); svc_destroy(&nn->nfsd_serv->sv_refcnt); + spin_lock(&nfsd_notifier_lock); nn->nfsd_serv = NULL; + spin_unlock(&nfsd_notifier_lock); } } From 2840fe864c91a0fe822169b1fbfddbcac9aeac43 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 29 Nov 2021 15:51:25 +1100 Subject: [PATCH 12/51] lockd: introduce nlmsvc_serv lockd has two globals - nlmsvc_task and nlmsvc_rqst - but mostly it wants the 'struct svc_serv', and when it doesn't want it exactly it can get to what it wants from the serv. This patch is a first step to removing nlmsvc_task and nlmsvc_rqst. It introduces nlmsvc_serv to store the 'struct svc_serv*'. This is set as soon as the serv is created, and cleared only when it is destroyed. Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- fs/lockd/svc.c | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index a9669b106dbde..83874878f41d8 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -54,6 +54,7 @@ EXPORT_SYMBOL_GPL(nlmsvc_ops); static DEFINE_MUTEX(nlmsvc_mutex); static unsigned int nlmsvc_users; +static struct svc_serv *nlmsvc_serv; static struct task_struct *nlmsvc_task; static struct svc_rqst *nlmsvc_rqst; unsigned long nlmsvc_timeout; @@ -306,13 +307,12 @@ static int lockd_inetaddr_event(struct notifier_block *this, !atomic_inc_not_zero(&nlm_ntf_refcnt)) goto out; - if (nlmsvc_rqst) { + if (nlmsvc_serv) { dprintk("lockd_inetaddr_event: removed %pI4\n", &ifa->ifa_local); sin.sin_family = AF_INET; sin.sin_addr.s_addr = ifa->ifa_local; - svc_age_temp_xprts_now(nlmsvc_rqst->rq_server, - (struct sockaddr *)&sin); + svc_age_temp_xprts_now(nlmsvc_serv, (struct sockaddr *)&sin); } atomic_dec(&nlm_ntf_refcnt); wake_up(&nlm_ntf_wq); @@ -336,14 +336,13 @@ static int lockd_inet6addr_event(struct notifier_block *this, !atomic_inc_not_zero(&nlm_ntf_refcnt)) goto out; - if (nlmsvc_rqst) { + if (nlmsvc_serv) { dprintk("lockd_inet6addr_event: removed %pI6\n", &ifa->addr); sin6.sin6_family = AF_INET6; sin6.sin6_addr = ifa->addr; if (ipv6_addr_type(&sin6.sin6_addr) & IPV6_ADDR_LINKLOCAL) sin6.sin6_scope_id = ifa->idev->dev->ifindex; - svc_age_temp_xprts_now(nlmsvc_rqst->rq_server, - (struct sockaddr *)&sin6); + svc_age_temp_xprts_now(nlmsvc_serv, (struct sockaddr *)&sin6); } atomic_dec(&nlm_ntf_refcnt); wake_up(&nlm_ntf_wq); @@ -423,15 +422,17 @@ static const struct svc_serv_ops lockd_sv_ops = { .svo_enqueue_xprt = svc_xprt_do_enqueue, }; -static struct svc_serv *lockd_create_svc(void) +static int lockd_create_svc(void) { struct svc_serv *serv; /* * Check whether we're already up and running. */ - if (nlmsvc_rqst) - return svc_get(nlmsvc_rqst->rq_server); + if (nlmsvc_serv) { + svc_get(nlmsvc_serv); + return 0; + } /* * Sanity check: if there's no pid, @@ -448,14 +449,15 @@ static struct svc_serv *lockd_create_svc(void) serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, &lockd_sv_ops); if (!serv) { printk(KERN_WARNING "lockd_up: create service failed\n"); - return ERR_PTR(-ENOMEM); + return -ENOMEM; } + nlmsvc_serv = serv; register_inetaddr_notifier(&lockd_inetaddr_notifier); #if IS_ENABLED(CONFIG_IPV6) register_inet6addr_notifier(&lockd_inet6addr_notifier); #endif dprintk("lockd_up: service created\n"); - return serv; + return 0; } /* @@ -468,11 +470,10 @@ int lockd_up(struct net *net, const struct cred *cred) mutex_lock(&nlmsvc_mutex); - serv = lockd_create_svc(); - if (IS_ERR(serv)) { - error = PTR_ERR(serv); + error = lockd_create_svc(); + if (error) goto err_create; - } + serv = nlmsvc_serv; error = lockd_up_net(serv, net, cred); if (error < 0) { @@ -487,6 +488,8 @@ int lockd_up(struct net *net, const struct cred *cred) } nlmsvc_users++; err_put: + if (nlmsvc_users == 0) + nlmsvc_serv = NULL; svc_put(serv); err_create: mutex_unlock(&nlmsvc_mutex); @@ -501,7 +504,7 @@ void lockd_down(struct net *net) { mutex_lock(&nlmsvc_mutex); - lockd_down_net(nlmsvc_rqst->rq_server, net); + lockd_down_net(nlmsvc_serv, net); if (nlmsvc_users) { if (--nlmsvc_users) goto out; @@ -519,6 +522,7 @@ lockd_down(struct net *net) dprintk("lockd_down: service stopped\n"); lockd_svc_exit_thread(); dprintk("lockd_down: service destroyed\n"); + nlmsvc_serv = NULL; nlmsvc_task = NULL; nlmsvc_rqst = NULL; out: From 5a8a7ff57421b7de3ae72019938ffb5daaee36e7 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 29 Nov 2021 15:51:25 +1100 Subject: [PATCH 13/51] lockd: simplify management of network status notifiers Now that the network status notifiers use nlmsvc_serv rather then nlmsvc_rqst the management can be simplified. Notifier unregistration synchronises with any pending notifications so providing we unregister before nlm_serv is freed no further interlock is required. So we move the unregister call to just before the thread is killed (which destroys the service) and just before the service is destroyed in the failure-path of lockd_up(). Then nlm_ntf_refcnt and nlm_ntf_wq can be removed. Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- fs/lockd/svc.c | 35 +++++++++-------------------------- 1 file changed, 9 insertions(+), 26 deletions(-) diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 83874878f41d8..20cebb191350f 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -59,9 +59,6 @@ static struct task_struct *nlmsvc_task; static struct svc_rqst *nlmsvc_rqst; unsigned long nlmsvc_timeout; -static atomic_t nlm_ntf_refcnt = ATOMIC_INIT(0); -static DECLARE_WAIT_QUEUE_HEAD(nlm_ntf_wq); - unsigned int lockd_net_id; /* @@ -303,8 +300,7 @@ static int lockd_inetaddr_event(struct notifier_block *this, struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; struct sockaddr_in sin; - if ((event != NETDEV_DOWN) || - !atomic_inc_not_zero(&nlm_ntf_refcnt)) + if (event != NETDEV_DOWN) goto out; if (nlmsvc_serv) { @@ -314,8 +310,6 @@ static int lockd_inetaddr_event(struct notifier_block *this, sin.sin_addr.s_addr = ifa->ifa_local; svc_age_temp_xprts_now(nlmsvc_serv, (struct sockaddr *)&sin); } - atomic_dec(&nlm_ntf_refcnt); - wake_up(&nlm_ntf_wq); out: return NOTIFY_DONE; @@ -332,8 +326,7 @@ static int lockd_inet6addr_event(struct notifier_block *this, struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr; struct sockaddr_in6 sin6; - if ((event != NETDEV_DOWN) || - !atomic_inc_not_zero(&nlm_ntf_refcnt)) + if (event != NETDEV_DOWN) goto out; if (nlmsvc_serv) { @@ -344,8 +337,6 @@ static int lockd_inet6addr_event(struct notifier_block *this, sin6.sin6_scope_id = ifa->idev->dev->ifindex; svc_age_temp_xprts_now(nlmsvc_serv, (struct sockaddr *)&sin6); } - atomic_dec(&nlm_ntf_refcnt); - wake_up(&nlm_ntf_wq); out: return NOTIFY_DONE; @@ -362,14 +353,6 @@ static void lockd_unregister_notifiers(void) #if IS_ENABLED(CONFIG_IPV6) unregister_inet6addr_notifier(&lockd_inet6addr_notifier); #endif - wait_event(nlm_ntf_wq, atomic_read(&nlm_ntf_refcnt) == 0); -} - -static void lockd_svc_exit_thread(void) -{ - atomic_dec(&nlm_ntf_refcnt); - lockd_unregister_notifiers(); - svc_exit_thread(nlmsvc_rqst); } static int lockd_start_svc(struct svc_serv *serv) @@ -388,11 +371,9 @@ static int lockd_start_svc(struct svc_serv *serv) printk(KERN_WARNING "lockd_up: svc_rqst allocation failed, error=%d\n", error); - lockd_unregister_notifiers(); goto out_rqst; } - atomic_inc(&nlm_ntf_refcnt); svc_sock_update_bufs(serv); serv->sv_maxconn = nlm_max_connections; @@ -410,7 +391,7 @@ static int lockd_start_svc(struct svc_serv *serv) return 0; out_task: - lockd_svc_exit_thread(); + svc_exit_thread(nlmsvc_rqst); nlmsvc_task = NULL; out_rqst: nlmsvc_rqst = NULL; @@ -477,7 +458,6 @@ int lockd_up(struct net *net, const struct cred *cred) error = lockd_up_net(serv, net, cred); if (error < 0) { - lockd_unregister_notifiers(); goto err_put; } @@ -488,8 +468,10 @@ int lockd_up(struct net *net, const struct cred *cred) } nlmsvc_users++; err_put: - if (nlmsvc_users == 0) + if (nlmsvc_users == 0) { + lockd_unregister_notifiers(); nlmsvc_serv = NULL; + } svc_put(serv); err_create: mutex_unlock(&nlmsvc_mutex); @@ -518,13 +500,14 @@ lockd_down(struct net *net) printk(KERN_ERR "lockd_down: no lockd running.\n"); BUG(); } + lockd_unregister_notifiers(); kthread_stop(nlmsvc_task); dprintk("lockd_down: service stopped\n"); - lockd_svc_exit_thread(); + svc_exit_thread(nlmsvc_rqst); + nlmsvc_rqst = NULL; dprintk("lockd_down: service destroyed\n"); nlmsvc_serv = NULL; nlmsvc_task = NULL; - nlmsvc_rqst = NULL; out: mutex_unlock(&nlmsvc_mutex); } From b73a2972041bee70eb0cbbb25fa77828c63c916b Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 29 Nov 2021 15:51:25 +1100 Subject: [PATCH 14/51] lockd: move lockd_start_svc() call into lockd_create_svc() lockd_start_svc() only needs to be called once, just after the svc is created. If the start fails, the svc is discarded too. It thus makes sense to call lockd_start_svc() from lockd_create_svc(). This allows us to remove the test against nlmsvc_rqst at the start of lockd_start_svc() - it must always be NULL. lockd_up() only held an extra reference on the svc until a thread was created - then it dropped it. The thread - and thus the extra reference - will remain until kthread_stop() is called. Now that the thread is created in lockd_create_svc(), the extra reference can be dropped there. So the 'serv' variable is no longer needed in lockd_up(). Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- fs/lockd/svc.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 20cebb191350f..91e7c839841ec 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -359,9 +359,6 @@ static int lockd_start_svc(struct svc_serv *serv) { int error; - if (nlmsvc_rqst) - return 0; - /* * Create the kernel thread and wait for it to start. */ @@ -406,6 +403,7 @@ static const struct svc_serv_ops lockd_sv_ops = { static int lockd_create_svc(void) { struct svc_serv *serv; + int error; /* * Check whether we're already up and running. @@ -432,6 +430,13 @@ static int lockd_create_svc(void) printk(KERN_WARNING "lockd_up: create service failed\n"); return -ENOMEM; } + + error = lockd_start_svc(serv); + /* The thread now holds the only reference */ + svc_put(serv); + if (error < 0) + return error; + nlmsvc_serv = serv; register_inetaddr_notifier(&lockd_inetaddr_notifier); #if IS_ENABLED(CONFIG_IPV6) @@ -446,7 +451,6 @@ static int lockd_create_svc(void) */ int lockd_up(struct net *net, const struct cred *cred) { - struct svc_serv *serv; int error; mutex_lock(&nlmsvc_mutex); @@ -454,25 +458,19 @@ int lockd_up(struct net *net, const struct cred *cred) error = lockd_create_svc(); if (error) goto err_create; - serv = nlmsvc_serv; - error = lockd_up_net(serv, net, cred); + error = lockd_up_net(nlmsvc_serv, net, cred); if (error < 0) { goto err_put; } - error = lockd_start_svc(serv); - if (error < 0) { - lockd_down_net(serv, net); - goto err_put; - } nlmsvc_users++; err_put: if (nlmsvc_users == 0) { lockd_unregister_notifiers(); + kthread_stop(nlmsvc_task); nlmsvc_serv = NULL; } - svc_put(serv); err_create: mutex_unlock(&nlmsvc_mutex); return error; From 6a4e2527a63620a820c4ebf3596b57176da26fb3 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 29 Nov 2021 15:51:25 +1100 Subject: [PATCH 15/51] lockd: move svc_exit_thread() into the thread The normal place to call svc_exit_thread() is from the thread itself just before it exists. Do this for lockd. This means that nlmsvc_rqst is not used out side of lockd_start_svc(), so it can be made local to that function, and renamed to 'rqst'. Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- fs/lockd/svc.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 91e7c839841ec..9aa499a761591 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -56,7 +56,6 @@ static DEFINE_MUTEX(nlmsvc_mutex); static unsigned int nlmsvc_users; static struct svc_serv *nlmsvc_serv; static struct task_struct *nlmsvc_task; -static struct svc_rqst *nlmsvc_rqst; unsigned long nlmsvc_timeout; unsigned int lockd_net_id; @@ -182,6 +181,11 @@ lockd(void *vrqstp) nlm_shutdown_hosts(); cancel_delayed_work_sync(&ln->grace_period_end); locks_end_grace(&ln->lockd_manager); + + dprintk("lockd_down: service stopped\n"); + + svc_exit_thread(rqstp); + return 0; } @@ -358,13 +362,14 @@ static void lockd_unregister_notifiers(void) static int lockd_start_svc(struct svc_serv *serv) { int error; + struct svc_rqst *rqst; /* * Create the kernel thread and wait for it to start. */ - nlmsvc_rqst = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE); - if (IS_ERR(nlmsvc_rqst)) { - error = PTR_ERR(nlmsvc_rqst); + rqst = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE); + if (IS_ERR(rqst)) { + error = PTR_ERR(rqst); printk(KERN_WARNING "lockd_up: svc_rqst allocation failed, error=%d\n", error); @@ -374,24 +379,23 @@ static int lockd_start_svc(struct svc_serv *serv) svc_sock_update_bufs(serv); serv->sv_maxconn = nlm_max_connections; - nlmsvc_task = kthread_create(lockd, nlmsvc_rqst, "%s", serv->sv_name); + nlmsvc_task = kthread_create(lockd, rqst, "%s", serv->sv_name); if (IS_ERR(nlmsvc_task)) { error = PTR_ERR(nlmsvc_task); printk(KERN_WARNING "lockd_up: kthread_run failed, error=%d\n", error); goto out_task; } - nlmsvc_rqst->rq_task = nlmsvc_task; + rqst->rq_task = nlmsvc_task; wake_up_process(nlmsvc_task); dprintk("lockd_up: service started\n"); return 0; out_task: - svc_exit_thread(nlmsvc_rqst); + svc_exit_thread(rqst); nlmsvc_task = NULL; out_rqst: - nlmsvc_rqst = NULL; return error; } @@ -500,9 +504,6 @@ lockd_down(struct net *net) } lockd_unregister_notifiers(); kthread_stop(nlmsvc_task); - dprintk("lockd_down: service stopped\n"); - svc_exit_thread(nlmsvc_rqst); - nlmsvc_rqst = NULL; dprintk("lockd_down: service destroyed\n"); nlmsvc_serv = NULL; nlmsvc_task = NULL; From 865b674069e05e5779fcf8cf7a166d2acb7e930b Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 29 Nov 2021 15:51:25 +1100 Subject: [PATCH 16/51] lockd: introduce lockd_put() There is some cleanup that is duplicated in lockd_down() and the failure path of lockd_up(). Factor these out into a new lockd_put() and call it from both places. lockd_put() does *not* take the mutex - that must be held by the caller. It decrements nlmsvc_users and if that reaches zero, it cleans up. Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- fs/lockd/svc.c | 64 +++++++++++++++++++++----------------------------- 1 file changed, 27 insertions(+), 37 deletions(-) diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 9aa499a761591..7f12c280fd30d 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -351,14 +351,6 @@ static struct notifier_block lockd_inet6addr_notifier = { }; #endif -static void lockd_unregister_notifiers(void) -{ - unregister_inetaddr_notifier(&lockd_inetaddr_notifier); -#if IS_ENABLED(CONFIG_IPV6) - unregister_inet6addr_notifier(&lockd_inet6addr_notifier); -#endif -} - static int lockd_start_svc(struct svc_serv *serv) { int error; @@ -450,6 +442,27 @@ static int lockd_create_svc(void) return 0; } +static void lockd_put(void) +{ + if (WARN(nlmsvc_users <= 0, "lockd_down: no users!\n")) + return; + if (--nlmsvc_users) + return; + + unregister_inetaddr_notifier(&lockd_inetaddr_notifier); +#if IS_ENABLED(CONFIG_IPV6) + unregister_inet6addr_notifier(&lockd_inet6addr_notifier); +#endif + + if (nlmsvc_task) { + kthread_stop(nlmsvc_task); + dprintk("lockd_down: service stopped\n"); + nlmsvc_task = NULL; + } + nlmsvc_serv = NULL; + dprintk("lockd_down: service destroyed\n"); +} + /* * Bring up the lockd process if it's not already up. */ @@ -461,21 +474,16 @@ int lockd_up(struct net *net, const struct cred *cred) error = lockd_create_svc(); if (error) - goto err_create; + goto err; + nlmsvc_users++; error = lockd_up_net(nlmsvc_serv, net, cred); if (error < 0) { - goto err_put; + lockd_put(); + goto err; } - nlmsvc_users++; -err_put: - if (nlmsvc_users == 0) { - lockd_unregister_notifiers(); - kthread_stop(nlmsvc_task); - nlmsvc_serv = NULL; - } -err_create: +err: mutex_unlock(&nlmsvc_mutex); return error; } @@ -489,25 +497,7 @@ lockd_down(struct net *net) { mutex_lock(&nlmsvc_mutex); lockd_down_net(nlmsvc_serv, net); - if (nlmsvc_users) { - if (--nlmsvc_users) - goto out; - } else { - printk(KERN_ERR "lockd_down: no users! task=%p\n", - nlmsvc_task); - BUG(); - } - - if (!nlmsvc_task) { - printk(KERN_ERR "lockd_down: no lockd running.\n"); - BUG(); - } - lockd_unregister_notifiers(); - kthread_stop(nlmsvc_task); - dprintk("lockd_down: service destroyed\n"); - nlmsvc_serv = NULL; - nlmsvc_task = NULL; -out: + lockd_put(); mutex_unlock(&nlmsvc_mutex); } EXPORT_SYMBOL_GPL(lockd_down); From ecd3ad68d2c6d3ae178a63a2d9a02c392904fd36 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 29 Nov 2021 15:51:25 +1100 Subject: [PATCH 17/51] lockd: rename lockd_create_svc() to lockd_get() lockd_create_svc() already does an svc_get() if the service already exists, so it is more like a "get" than a "create". So: - Move the increment of nlmsvc_users into the function as well - rename to lockd_get(). It is now the inverse of lockd_put(). Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- fs/lockd/svc.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 7f12c280fd30d..1a7c11118b320 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -396,16 +396,14 @@ static const struct svc_serv_ops lockd_sv_ops = { .svo_enqueue_xprt = svc_xprt_do_enqueue, }; -static int lockd_create_svc(void) +static int lockd_get(void) { struct svc_serv *serv; int error; - /* - * Check whether we're already up and running. - */ if (nlmsvc_serv) { svc_get(nlmsvc_serv); + nlmsvc_users++; return 0; } @@ -439,6 +437,7 @@ static int lockd_create_svc(void) register_inet6addr_notifier(&lockd_inet6addr_notifier); #endif dprintk("lockd_up: service created\n"); + nlmsvc_users++; return 0; } @@ -472,10 +471,9 @@ int lockd_up(struct net *net, const struct cred *cred) mutex_lock(&nlmsvc_mutex); - error = lockd_create_svc(); + error = lockd_get(); if (error) goto err; - nlmsvc_users++; error = lockd_up_net(nlmsvc_serv, net, cred); if (error < 0) { From cf0e124e0a489944d08fcc3c694d2b234d2cc658 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 29 Nov 2021 15:51:25 +1100 Subject: [PATCH 18/51] SUNRPC: move the pool_map definitions (back) into svc.c These definitions are not used outside of svc.c, and there is no evidence that they ever have been. So move them into svc.c and make the declarations 'static'. Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 25 ------------------------- net/sunrpc/svc.c | 31 +++++++++++++++++++++++++------ 2 files changed, 25 insertions(+), 31 deletions(-) diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 0b38c6eaf9852..d69e6108cb835 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -494,29 +494,6 @@ struct svc_procedure { const char * pc_name; /* for display */ }; -/* - * Mode for mapping cpus to pools. - */ -enum { - SVC_POOL_AUTO = -1, /* choose one of the others */ - SVC_POOL_GLOBAL, /* no mapping, just a single global pool - * (legacy & UP mode) */ - SVC_POOL_PERCPU, /* one pool per cpu */ - SVC_POOL_PERNODE /* one pool per numa node */ -}; - -struct svc_pool_map { - int count; /* How many svc_servs use us */ - int mode; /* Note: int not enum to avoid - * warnings about "enumeration value - * not handled in switch" */ - unsigned int npools; - unsigned int *pool_to; /* maps pool id to cpu or node */ - unsigned int *to_pool; /* maps cpu or node to pool id */ -}; - -extern struct svc_pool_map svc_pool_map; - /* * Function prototypes. */ @@ -533,8 +510,6 @@ void svc_rqst_replace_page(struct svc_rqst *rqstp, struct page *page); void svc_rqst_free(struct svc_rqst *); void svc_exit_thread(struct svc_rqst *); -unsigned int svc_pool_map_get(void); -void svc_pool_map_put(void); struct svc_serv * svc_create_pooled(struct svc_program *, unsigned int, const struct svc_serv_ops *); int svc_set_num_threads(struct svc_serv *, struct svc_pool *, int); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 5513f8c9a8d63..f0dd9ef7e0cde 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -41,14 +41,35 @@ static void svc_unregister(const struct svc_serv *serv, struct net *net); #define SVC_POOL_DEFAULT SVC_POOL_GLOBAL +/* + * Mode for mapping cpus to pools. + */ +enum { + SVC_POOL_AUTO = -1, /* choose one of the others */ + SVC_POOL_GLOBAL, /* no mapping, just a single global pool + * (legacy & UP mode) */ + SVC_POOL_PERCPU, /* one pool per cpu */ + SVC_POOL_PERNODE /* one pool per numa node */ +}; + /* * Structure for mapping cpus to pools and vice versa. * Setup once during sunrpc initialisation. */ -struct svc_pool_map svc_pool_map = { + +struct svc_pool_map { + int count; /* How many svc_servs use us */ + int mode; /* Note: int not enum to avoid + * warnings about "enumeration value + * not handled in switch" */ + unsigned int npools; + unsigned int *pool_to; /* maps pool id to cpu or node */ + unsigned int *to_pool; /* maps cpu or node to pool id */ +}; + +static struct svc_pool_map svc_pool_map = { .mode = SVC_POOL_DEFAULT }; -EXPORT_SYMBOL_GPL(svc_pool_map); static DEFINE_MUTEX(svc_pool_map_mutex);/* protects svc_pool_map.count only */ @@ -222,7 +243,7 @@ svc_pool_map_init_pernode(struct svc_pool_map *m) * vice versa). Initialise the map if we're the first user. * Returns the number of pools. */ -unsigned int +static unsigned int svc_pool_map_get(void) { struct svc_pool_map *m = &svc_pool_map; @@ -257,7 +278,6 @@ svc_pool_map_get(void) mutex_unlock(&svc_pool_map_mutex); return m->npools; } -EXPORT_SYMBOL_GPL(svc_pool_map_get); /* * Drop a reference to the global map of cpus to pools. @@ -266,7 +286,7 @@ EXPORT_SYMBOL_GPL(svc_pool_map_get); * mode using the pool_mode module option without * rebooting or re-loading sunrpc.ko. */ -void +static void svc_pool_map_put(void) { struct svc_pool_map *m = &svc_pool_map; @@ -283,7 +303,6 @@ svc_pool_map_put(void) mutex_unlock(&svc_pool_map_mutex); } -EXPORT_SYMBOL_GPL(svc_pool_map_put); static int svc_pool_map_get_node(unsigned int pidx) { From 93aa619eb0b42eec2f3a9b4d9db41f5095390aec Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 29 Nov 2021 15:51:25 +1100 Subject: [PATCH 19/51] SUNRPC: always treat sv_nrpools==1 as "not pooled" Currently 'pooled' services hold a reference on the pool_map, and 'unpooled' services do not. svc_destroy() uses the presence of ->svo_function (via svc_serv_is_pooled()) to determine if the reference should be dropped. There is no direct correlation between being pooled and the use of svo_function, though in practice, lockd is the only non-pooled service, and the only one not to use svo_function. This is untidy and would cause problems if we changed lockd to use svc_set_num_threads(), which requires the use of ->svo_function. So change the test for "is the service pooled" to "is sv_nrpools > 1". This means that when svc_pool_map_get() returns 1, it must NOT take a reference to the pool. We discard svc_serv_is_pooled(), and test sv_nrpools directly. Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- net/sunrpc/svc.c | 54 ++++++++++++++++++++++++++---------------------- 1 file changed, 29 insertions(+), 25 deletions(-) diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index f0dd9ef7e0cde..5fbe7f55289e1 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -37,8 +37,6 @@ static void svc_unregister(const struct svc_serv *serv, struct net *net); -#define svc_serv_is_pooled(serv) ((serv)->sv_ops->svo_function) - #define SVC_POOL_DEFAULT SVC_POOL_GLOBAL /* @@ -240,8 +238,10 @@ svc_pool_map_init_pernode(struct svc_pool_map *m) /* * Add a reference to the global map of cpus to pools (and - * vice versa). Initialise the map if we're the first user. - * Returns the number of pools. + * vice versa) if pools are in use. + * Initialise the map if we're the first user. + * Returns the number of pools. If this is '1', no reference + * was taken. */ static unsigned int svc_pool_map_get(void) @@ -253,6 +253,7 @@ svc_pool_map_get(void) if (m->count++) { mutex_unlock(&svc_pool_map_mutex); + WARN_ON_ONCE(m->npools <= 1); return m->npools; } @@ -268,29 +269,36 @@ svc_pool_map_get(void) break; } - if (npools < 0) { + if (npools <= 0) { /* default, or memory allocation failure */ npools = 1; m->mode = SVC_POOL_GLOBAL; } m->npools = npools; + if (npools == 1) + /* service is unpooled, so doesn't hold a reference */ + m->count--; + mutex_unlock(&svc_pool_map_mutex); - return m->npools; + return npools; } /* - * Drop a reference to the global map of cpus to pools. + * Drop a reference to the global map of cpus to pools, if + * pools were in use, i.e. if npools > 1. * When the last reference is dropped, the map data is * freed; this allows the sysadmin to change the pool * mode using the pool_mode module option without * rebooting or re-loading sunrpc.ko. */ static void -svc_pool_map_put(void) +svc_pool_map_put(int npools) { struct svc_pool_map *m = &svc_pool_map; + if (npools <= 1) + return; mutex_lock(&svc_pool_map_mutex); if (!--m->count) { @@ -359,21 +367,18 @@ svc_pool_for_cpu(struct svc_serv *serv, int cpu) struct svc_pool_map *m = &svc_pool_map; unsigned int pidx = 0; - /* - * An uninitialised map happens in a pure client when - * lockd is brought up, so silently treat it the - * same as SVC_POOL_GLOBAL. - */ - if (svc_serv_is_pooled(serv)) { - switch (m->mode) { - case SVC_POOL_PERCPU: - pidx = m->to_pool[cpu]; - break; - case SVC_POOL_PERNODE: - pidx = m->to_pool[cpu_to_node(cpu)]; - break; - } + if (serv->sv_nrpools <= 1) + return serv->sv_pools; + + switch (m->mode) { + case SVC_POOL_PERCPU: + pidx = m->to_pool[cpu]; + break; + case SVC_POOL_PERNODE: + pidx = m->to_pool[cpu_to_node(cpu)]; + break; } + return &serv->sv_pools[pidx % serv->sv_nrpools]; } @@ -526,7 +531,7 @@ svc_create_pooled(struct svc_program *prog, unsigned int bufsize, goto out_err; return serv; out_err: - svc_pool_map_put(); + svc_pool_map_put(npools); return NULL; } EXPORT_SYMBOL_GPL(svc_create_pooled); @@ -561,8 +566,7 @@ svc_destroy(struct kref *ref) cache_clean_deferred(serv); - if (svc_serv_is_pooled(serv)) - svc_pool_map_put(); + svc_pool_map_put(serv->sv_nrpools); kfree(serv->sv_pools); kfree(serv); From 6b044fbaab02292fedb17565dbb3f2528083b169 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 29 Nov 2021 15:51:25 +1100 Subject: [PATCH 20/51] lockd: use svc_set_num_threads() for thread start and stop svc_set_num_threads() does everything that lockd_start_svc() does, except set sv_maxconn. It also (when passed 0) finds the threads and stops them with kthread_stop(). So move the setting for sv_maxconn, and use svc_set_num_thread() We now don't need nlmsvc_task. Now that we use svc_set_num_threads() it makes sense to set svo_module. This request that the thread exists with module_put_and_exit(). Also fix the documentation for svo_module to make this explicit. svc_prepare_thread is now only used where it is defined, so it can be made static. Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- fs/lockd/svc.c | 58 ++++++-------------------------------- include/linux/sunrpc/svc.h | 6 ++-- net/sunrpc/svc.c | 3 +- 3 files changed, 12 insertions(+), 55 deletions(-) diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 1a7c11118b320..4defefd89cbff 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -55,7 +55,6 @@ EXPORT_SYMBOL_GPL(nlmsvc_ops); static DEFINE_MUTEX(nlmsvc_mutex); static unsigned int nlmsvc_users; static struct svc_serv *nlmsvc_serv; -static struct task_struct *nlmsvc_task; unsigned long nlmsvc_timeout; unsigned int lockd_net_id; @@ -186,7 +185,7 @@ lockd(void *vrqstp) svc_exit_thread(rqstp); - return 0; + module_put_and_exit(0); } static int create_lockd_listener(struct svc_serv *serv, const char *name, @@ -292,8 +291,8 @@ static void lockd_down_net(struct svc_serv *serv, struct net *net) __func__, net->ns.inum); } } else { - pr_err("%s: no users! task=%p, net=%x\n", - __func__, nlmsvc_task, net->ns.inum); + pr_err("%s: no users! net=%x\n", + __func__, net->ns.inum); BUG(); } } @@ -351,49 +350,11 @@ static struct notifier_block lockd_inet6addr_notifier = { }; #endif -static int lockd_start_svc(struct svc_serv *serv) -{ - int error; - struct svc_rqst *rqst; - - /* - * Create the kernel thread and wait for it to start. - */ - rqst = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE); - if (IS_ERR(rqst)) { - error = PTR_ERR(rqst); - printk(KERN_WARNING - "lockd_up: svc_rqst allocation failed, error=%d\n", - error); - goto out_rqst; - } - - svc_sock_update_bufs(serv); - serv->sv_maxconn = nlm_max_connections; - - nlmsvc_task = kthread_create(lockd, rqst, "%s", serv->sv_name); - if (IS_ERR(nlmsvc_task)) { - error = PTR_ERR(nlmsvc_task); - printk(KERN_WARNING - "lockd_up: kthread_run failed, error=%d\n", error); - goto out_task; - } - rqst->rq_task = nlmsvc_task; - wake_up_process(nlmsvc_task); - - dprintk("lockd_up: service started\n"); - return 0; - -out_task: - svc_exit_thread(rqst); - nlmsvc_task = NULL; -out_rqst: - return error; -} - static const struct svc_serv_ops lockd_sv_ops = { .svo_shutdown = svc_rpcb_cleanup, + .svo_function = lockd, .svo_enqueue_xprt = svc_xprt_do_enqueue, + .svo_module = THIS_MODULE, }; static int lockd_get(void) @@ -425,7 +386,8 @@ static int lockd_get(void) return -ENOMEM; } - error = lockd_start_svc(serv); + serv->sv_maxconn = nlm_max_connections; + error = svc_set_num_threads(serv, NULL, 1); /* The thread now holds the only reference */ svc_put(serv); if (error < 0) @@ -453,11 +415,7 @@ static void lockd_put(void) unregister_inet6addr_notifier(&lockd_inet6addr_notifier); #endif - if (nlmsvc_task) { - kthread_stop(nlmsvc_task); - dprintk("lockd_down: service stopped\n"); - nlmsvc_task = NULL; - } + svc_set_num_threads(nlmsvc_serv, NULL, 0); nlmsvc_serv = NULL; dprintk("lockd_down: service destroyed\n"); } diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index d69e6108cb835..cf175d47c6b7c 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -64,7 +64,9 @@ struct svc_serv_ops { /* queue up a transport for servicing */ void (*svo_enqueue_xprt)(struct svc_xprt *); - /* optional module to count when adding threads (pooled svcs only) */ + /* optional module to count when adding threads. + * Thread function must call module_put_and_exit() to exit. + */ struct module *svo_module; }; @@ -504,8 +506,6 @@ struct svc_serv *svc_create(struct svc_program *, unsigned int, const struct svc_serv_ops *); struct svc_rqst *svc_rqst_alloc(struct svc_serv *serv, struct svc_pool *pool, int node); -struct svc_rqst *svc_prepare_thread(struct svc_serv *serv, - struct svc_pool *pool, int node); void svc_rqst_replace_page(struct svc_rqst *rqstp, struct page *page); void svc_rqst_free(struct svc_rqst *); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 5fbe7f55289e1..2aabec2b4becc 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -652,7 +652,7 @@ svc_rqst_alloc(struct svc_serv *serv, struct svc_pool *pool, int node) } EXPORT_SYMBOL_GPL(svc_rqst_alloc); -struct svc_rqst * +static struct svc_rqst * svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node) { struct svc_rqst *rqstp; @@ -672,7 +672,6 @@ svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node) spin_unlock_bh(&pool->sp_lock); return rqstp; } -EXPORT_SYMBOL_GPL(svc_prepare_thread); /* * Choose a pool in which to create a new thread, for svc_set_num_threads From 23a1a573c61ccb5e7829c1f5472d3e025293a031 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 29 Nov 2021 15:51:25 +1100 Subject: [PATCH 21/51] NFS: switch the callback service back to non-pooled. Now that thread management is consistent there is no need for nfs-callback to use svc_create_pooled() as introduced in Commit df807fffaabd ("NFSv4.x/callback: Create the callback service through svc_create_pooled"). So switch back to svc_create(). If service pools were configured, but the number of threads were left at '1', nfs callback may not work reliably when svc_create_pooled() is used. Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- fs/nfs/callback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 6cdc9d18a7dd3..c4994c1d4e36c 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -286,7 +286,7 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion) printk(KERN_WARNING "nfs_callback_create_svc: no kthread, %d users??\n", cb_info->users); - serv = svc_create_pooled(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, sv_ops); + serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, sv_ops); if (!serv) { printk(KERN_ERR "nfs_callback_create_svc: create service failed\n"); return ERR_PTR(-ENOMEM); From 7578b2f628db27281d3165af0aa862311883a858 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 30 Sep 2021 19:10:03 -0400 Subject: [PATCH 22/51] NFSD: Remove be32_to_cpu() from DRC hash function Commit 7142b98d9fd7 ("nfsd: Clean up drc cache in preparation for global spinlock elimination"), billed as a clean-up, added be32_to_cpu() to the DRC hash function without explanation. That commit removed two comments that state that byte-swapping in the hash function is unnecessary without explaining whether there was a need for that change. On some Intel CPUs, the swab32 instruction is known to cause a CPU pipeline stall. be32_to_cpu() does not add extra randomness, since the hash multiplication is done /before/ shifting to the high-order bits of the result. As a micro-optimization, remove the unnecessary transform from the DRC hash function. Signed-off-by: Chuck Lever --- fs/nfsd/nfscache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 6e0b6f3148dca..a4a69ab6ab280 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -87,7 +87,7 @@ nfsd_hashsize(unsigned int limit) static u32 nfsd_cache_hash(__be32 xid, struct nfsd_net *nn) { - return hash_32(be32_to_cpu(xid), nn->maskbits); + return hash_32((__force u32)xid, nn->maskbits); } static struct svc_cacherep * From 1e37d0e5bda45881eea1bec4b812def72c7d4aea Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Thu, 2 Dec 2021 16:35:42 +0800 Subject: [PATCH 23/51] NFSD: Fix inconsistent indenting Eliminate the follow smatch warning: fs/nfsd/nfs4xdr.c:4766 nfsd4_encode_read_plus_hole() warn: inconsistent indenting. Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Signed-off-by: Chuck Lever --- fs/nfsd/nfs4xdr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 5a93a5db4fb0a..4a75a27956c1a 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -4804,8 +4804,8 @@ nfsd4_encode_read_plus_hole(struct nfsd4_compoundres *resp, return nfserr_resource; *p++ = htonl(NFS4_CONTENT_HOLE); - p = xdr_encode_hyper(p, read->rd_offset); - p = xdr_encode_hyper(p, count); + p = xdr_encode_hyper(p, read->rd_offset); + p = xdr_encode_hyper(p, count); *eof = (read->rd_offset + count) >= f_size; *maxcount = min_t(unsigned long, count, *maxcount); From 1463b38e7cf34d4cc60f41daff459ad807b2e408 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 1 Dec 2021 10:58:14 +1100 Subject: [PATCH 24/51] NFSD: simplify per-net file cache management We currently have a 'laundrette' for closing cached files - a different work-item for each network-namespace. These 'laundrettes' (aka struct nfsd_fcache_disposal) are currently on a list, and are freed using rcu. The list is not necessary as we have a per-namespace structure (struct nfsd_net) which can hold a link to the nfsd_fcache_disposal. The use of kfree_rcu is also unnecessary as the cache is cleaned of all files associated with a given namespace, and no new files can be added, before the nfsd_fcache_disposal is freed. So add a '->fcache_disposal' link to nfsd_net, and discard the list management and rcu usage. Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 76 +++++++++------------------------------------ fs/nfsd/netns.h | 2 ++ 2 files changed, 17 insertions(+), 61 deletions(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index fdf89fcf1a0ca..aa5dca498b27e 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -44,12 +44,9 @@ struct nfsd_fcache_bucket { static DEFINE_PER_CPU(unsigned long, nfsd_file_cache_hits); struct nfsd_fcache_disposal { - struct list_head list; struct work_struct work; - struct net *net; spinlock_t lock; struct list_head freeme; - struct rcu_head rcu; }; static struct workqueue_struct *nfsd_filecache_wq __read_mostly; @@ -62,8 +59,6 @@ static long nfsd_file_lru_flags; static struct fsnotify_group *nfsd_file_fsnotify_group; static atomic_long_t nfsd_filecache_count; static struct delayed_work nfsd_filecache_laundrette; -static DEFINE_SPINLOCK(laundrette_lock); -static LIST_HEAD(laundrettes); static void nfsd_file_gc(void); @@ -367,19 +362,13 @@ nfsd_file_list_remove_disposal(struct list_head *dst, static void nfsd_file_list_add_disposal(struct list_head *files, struct net *net) { - struct nfsd_fcache_disposal *l; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct nfsd_fcache_disposal *l = nn->fcache_disposal; - rcu_read_lock(); - list_for_each_entry_rcu(l, &laundrettes, list) { - if (l->net == net) { - spin_lock(&l->lock); - list_splice_tail_init(files, &l->freeme); - spin_unlock(&l->lock); - queue_work(nfsd_filecache_wq, &l->work); - break; - } - } - rcu_read_unlock(); + spin_lock(&l->lock); + list_splice_tail_init(files, &l->freeme); + spin_unlock(&l->lock); + queue_work(nfsd_filecache_wq, &l->work); } static void @@ -755,7 +744,7 @@ nfsd_file_cache_purge(struct net *net) } static struct nfsd_fcache_disposal * -nfsd_alloc_fcache_disposal(struct net *net) +nfsd_alloc_fcache_disposal(void) { struct nfsd_fcache_disposal *l; @@ -763,7 +752,6 @@ nfsd_alloc_fcache_disposal(struct net *net) if (!l) return NULL; INIT_WORK(&l->work, nfsd_file_delayed_close); - l->net = net; spin_lock_init(&l->lock); INIT_LIST_HEAD(&l->freeme); return l; @@ -772,61 +760,27 @@ nfsd_alloc_fcache_disposal(struct net *net) static void nfsd_free_fcache_disposal(struct nfsd_fcache_disposal *l) { - rcu_assign_pointer(l->net, NULL); cancel_work_sync(&l->work); nfsd_file_dispose_list(&l->freeme); - kfree_rcu(l, rcu); -} - -static void -nfsd_add_fcache_disposal(struct nfsd_fcache_disposal *l) -{ - spin_lock(&laundrette_lock); - list_add_tail_rcu(&l->list, &laundrettes); - spin_unlock(&laundrette_lock); -} - -static void -nfsd_del_fcache_disposal(struct nfsd_fcache_disposal *l) -{ - spin_lock(&laundrette_lock); - list_del_rcu(&l->list); - spin_unlock(&laundrette_lock); -} - -static int -nfsd_alloc_fcache_disposal_net(struct net *net) -{ - struct nfsd_fcache_disposal *l; - - l = nfsd_alloc_fcache_disposal(net); - if (!l) - return -ENOMEM; - nfsd_add_fcache_disposal(l); - return 0; + kfree(l); } static void nfsd_free_fcache_disposal_net(struct net *net) { - struct nfsd_fcache_disposal *l; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct nfsd_fcache_disposal *l = nn->fcache_disposal; - rcu_read_lock(); - list_for_each_entry_rcu(l, &laundrettes, list) { - if (l->net != net) - continue; - nfsd_del_fcache_disposal(l); - rcu_read_unlock(); - nfsd_free_fcache_disposal(l); - return; - } - rcu_read_unlock(); + nfsd_free_fcache_disposal(l); } int nfsd_file_cache_start_net(struct net *net) { - return nfsd_alloc_fcache_disposal_net(net); + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + nn->fcache_disposal = nfsd_alloc_fcache_disposal(); + return nn->fcache_disposal ? 0 : -ENOMEM; } void diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 021acdc0d03bb..9e8b77d2a3a47 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -185,6 +185,8 @@ struct nfsd_net { /* utsname taken from the process that starts the server */ char nfsd_name[UNX_MAXNODENAME+1]; + + struct nfsd_fcache_disposal *fcache_disposal; }; /* Simple check to find out if a given net was properly initialized */ From 5089f3d97552b0b07101e02a3fca0146b9b9d3b5 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 19 Oct 2021 15:17:21 -0400 Subject: [PATCH 25/51] SUNRPC: Remove low signal-to-noise tracepoints I'm about to add more information to the server-side SUNRPC tracepoints, so I'm going to offset the increased trace log consumption by getting rid of some tracepoints that fire frequently but don't offer much value. trace_svc_xprt_received() was useful for debugging, perhaps, but is not generally informative. trace_svc_handle_xprt() reports largely the same information as trace_svc_xdr_recvfrom(). As a clean-up, rename trace_svc_xprt_do_enqueue() to match svc_xprt_dequeue(). Signed-off-by: Chuck Lever --- include/trace/events/sunrpc.h | 24 +----------------------- net/sunrpc/svc_xprt.c | 6 ++---- 2 files changed, 3 insertions(+), 27 deletions(-) diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 3a99358c262b4..684cc0e322fa3 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -1768,7 +1768,7 @@ TRACE_EVENT(svc_xprt_create_err, __entry->error) ); -TRACE_EVENT(svc_xprt_do_enqueue, +TRACE_EVENT(svc_xprt_enqueue, TP_PROTO(struct svc_xprt *xprt, struct svc_rqst *rqst), TP_ARGS(xprt, rqst), @@ -1815,7 +1815,6 @@ DECLARE_EVENT_CLASS(svc_xprt_event, ), \ TP_ARGS(xprt)) -DEFINE_SVC_XPRT_EVENT(received); DEFINE_SVC_XPRT_EVENT(no_write_space); DEFINE_SVC_XPRT_EVENT(close); DEFINE_SVC_XPRT_EVENT(detach); @@ -1902,27 +1901,6 @@ TRACE_EVENT(svc_alloc_arg_err, TP_printk("pages=%u", __entry->pages) ); -TRACE_EVENT(svc_handle_xprt, - TP_PROTO(struct svc_xprt *xprt, int len), - - TP_ARGS(xprt, len), - - TP_STRUCT__entry( - __field(int, len) - __field(unsigned long, flags) - __string(addr, xprt->xpt_remotebuf) - ), - - TP_fast_assign( - __entry->len = len; - __entry->flags = xprt->xpt_flags; - __assign_str(addr, xprt->xpt_remotebuf); - ), - - TP_printk("addr=%s len=%d flags=%s", __get_str(addr), - __entry->len, show_svc_xprt_flags(__entry->flags)) -); - TRACE_EVENT(svc_stats_latency, TP_PROTO(const struct svc_rqst *rqst), diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 1e99ba1b9d723..b1744432489e3 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -264,8 +264,6 @@ void svc_xprt_received(struct svc_xprt *xprt) return; } - trace_svc_xprt_received(xprt); - /* As soon as we clear busy, the xprt could be closed and * 'put', so we need a reference to call svc_enqueue_xprt with: */ @@ -466,7 +464,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt) out_unlock: rcu_read_unlock(); put_cpu(); - trace_svc_xprt_do_enqueue(xprt, rqstp); + trace_svc_xprt_enqueue(xprt, rqstp); } EXPORT_SYMBOL_GPL(svc_xprt_do_enqueue); @@ -842,8 +840,8 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt) atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved); } else svc_xprt_received(xprt); + out: - trace_svc_handle_xprt(xprt, len); return len; } From 70e94d757b3e1f46486d573729d84c8955c81dce Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 21 Oct 2021 12:11:45 -0400 Subject: [PATCH 26/51] NFSD: Combine XDR error tracepoints Clean up: The garbage_args and cant_encode tracepoints report the same information as each other, so combine them into a single tracepoint class to reduce code duplication and slightly reduce the size of trace.o. Signed-off-by: Chuck Lever --- fs/nfsd/trace.h | 28 +++++++--------------------- 1 file changed, 7 insertions(+), 21 deletions(-) diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index f1e0d3c51bc23..6afb320931042 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -47,7 +47,7 @@ rqstp->rq_xprt->xpt_remotelen); \ } while (0); -TRACE_EVENT(nfsd_garbage_args_err, +DECLARE_EVENT_CLASS(nfsd_xdr_err_class, TP_PROTO( const struct svc_rqst *rqstp ), @@ -69,27 +69,13 @@ TRACE_EVENT(nfsd_garbage_args_err, ) ); -TRACE_EVENT(nfsd_cant_encode_err, - TP_PROTO( - const struct svc_rqst *rqstp - ), - TP_ARGS(rqstp), - TP_STRUCT__entry( - NFSD_TRACE_PROC_ARG_FIELDS +#define DEFINE_NFSD_XDR_ERR_EVENT(name) \ +DEFINE_EVENT(nfsd_xdr_err_class, nfsd_##name##_err, \ + TP_PROTO(const struct svc_rqst *rqstp), \ + TP_ARGS(rqstp)) - __field(u32, vers) - __field(u32, proc) - ), - TP_fast_assign( - NFSD_TRACE_PROC_ARG_ASSIGNMENTS - - __entry->vers = rqstp->rq_vers; - __entry->proc = rqstp->rq_proc; - ), - TP_printk("xid=0x%08x vers=%u proc=%u", - __entry->xid, __entry->vers, __entry->proc - ) -); +DEFINE_NFSD_XDR_ERR_EVENT(garbage_args); +DEFINE_NFSD_XDR_ERR_EVENT(cant_encode); #define show_nfsd_may_flags(x) \ __print_flags(x, "|", \ From 3dcd1d8aab00c5d3a0a3725253c86440b1a0f5a7 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 7 Dec 2021 17:32:21 -0500 Subject: [PATCH 27/51] nfsd: improve stateid access bitmask documentation The use of the bitmaps is confusing. Add a cross-reference to make it easier to find the existing comment. Add an updated reference with URL to make it quicker to look up. And a bit more editorializing about the value of this. Signed-off-by: J. Bruce Fields Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 14 ++++++++++---- fs/nfsd/state.h | 4 ++++ 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 1956d377d1a60..72e3833c30349 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -360,11 +360,13 @@ static const struct nfsd4_callback_ops nfsd4_cb_notify_lock_ops = { * st_{access,deny}_bmap field of the stateid, in order to track not * only what share bits are currently in force, but also what * combinations of share bits previous opens have used. This allows us - * to enforce the recommendation of rfc 3530 14.2.19 that the server - * return an error if the client attempt to downgrade to a combination - * of share bits not explicable by closing some of its previous opens. + * to enforce the recommendation in + * https://datatracker.ietf.org/doc/html/rfc7530#section-16.19.4 that + * the server return an error if the client attempt to downgrade to a + * combination of share bits not explicable by closing some of its + * previous opens. * - * XXX: This enforcement is actually incomplete, since we don't keep + * This enforcement is arguably incomplete, since we don't keep * track of access/deny bit combinations; so, e.g., we allow: * * OPEN allow read, deny write @@ -372,6 +374,10 @@ static const struct nfsd4_callback_ops nfsd4_cb_notify_lock_ops = { * DOWNGRADE allow read, deny none * * which we should reject. + * + * But you could also argue that our current code is already overkill, + * since it only exists to return NFS4ERR_INVAL on incorrect client + * behavior. */ static unsigned int bmap_to_share_mode(unsigned long bmap) diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index e73bdbb1634ab..6eb3c7157214b 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -568,6 +568,10 @@ struct nfs4_ol_stateid { struct list_head st_locks; struct nfs4_stateowner *st_stateowner; struct nfs4_clnt_odstate *st_clnt_odstate; +/* + * These bitmasks use 3 separate bits for READ, ALLOW, and BOTH; see the + * comment above bmap_to_share_mode() for explanation: + */ unsigned char st_access_bmap; unsigned char st_deny_bmap; struct nfs4_ol_stateid *st_openstp; From cd2e999c7c394ae916d8be741418b3c6c1dddea8 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 13 Dec 2021 10:20:45 -0500 Subject: [PATCH 28/51] NFSD: De-duplicate nfsd4_decode_bitmap4() Clean up. Trond points out that xdr_stream_decode_uint32_array() does the same thing as nfsd4_decode_bitmap4(). Suggested-by: Trond Myklebust Signed-off-by: Chuck Lever --- fs/nfsd/nfs4xdr.c | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 4a75a27956c1a..899de438e5290 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -277,21 +277,10 @@ nfsd4_decode_verifier4(struct nfsd4_compoundargs *argp, nfs4_verifier *verf) static __be32 nfsd4_decode_bitmap4(struct nfsd4_compoundargs *argp, u32 *bmval, u32 bmlen) { - u32 i, count; - __be32 *p; - - if (xdr_stream_decode_u32(argp->xdr, &count) < 0) - return nfserr_bad_xdr; - /* request sanity */ - if (count > 1000) - return nfserr_bad_xdr; - p = xdr_inline_decode(argp->xdr, count << 2); - if (!p) - return nfserr_bad_xdr; - for (i = 0; i < bmlen; i++) - bmval[i] = (i < count) ? be32_to_cpup(p++) : 0; + ssize_t status; - return nfs_ok; + status = xdr_stream_decode_uint32_array(argp->xdr, bmval, bmlen); + return status == -EBADMSG ? nfserr_bad_xdr : nfs_ok; } static __be32 From 40595cdc93edf4110c0f0c0b06f8d82008f23929 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 16 Dec 2021 12:20:13 -0500 Subject: [PATCH 29/51] nfs: block notification on fs with its own ->lock NFSv4.1 supports an optional lock notification feature which notifies the client when a lock comes available. (Normally NFSv4 clients just poll for locks if necessary.) To make that work, we need to request a blocking lock from the filesystem. We turned that off for NFS in commit f657f8eef3ff ("nfs: don't atempt blocking locks on nfs reexports") [sic] because it actually blocks the nfsd thread while waiting for the lock. Thanks to Vasily Averin for pointing out that NFS isn't the only filesystem with that problem. Any filesystem that leaves ->lock NULL will use posix_lock_file(), which does the right thing. Simplest is just to assume that any filesystem that defines its own ->lock is not safe to request a blocking lock from. So, this patch mostly reverts commit f657f8eef3ff ("nfs: don't atempt blocking locks on nfs reexports") [sic] and commit b840be2f00c0 ("lockd: don't attempt blocking locks on nfs reexports"), and instead uses a check of ->lock (Vasily's suggestion) to decide whether to support blocking lock notifications on a given filesystem. Also add a little documentation. Perhaps someday we could add back an export flag later to allow filesystems with "good" ->lock methods to support blocking lock notifications. Reported-by: Vasily Averin Signed-off-by: J. Bruce Fields [ cel: Description rewritten to address checkpatch nits ] [ cel: Fixed warning when SUNRPC debugging is disabled ] [ cel: Fixed NULL check ] Signed-off-by: Chuck Lever Reviewed-by: Vasily Averin --- fs/lockd/svclock.c | 6 ++++-- fs/nfs/export.c | 2 +- fs/nfsd/nfs4state.c | 18 ++++++++++++------ include/linux/exportfs.h | 2 -- include/linux/lockd/lockd.h | 9 +++++++-- 5 files changed, 24 insertions(+), 13 deletions(-) diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index e9b85d8fd5fe7..cb3658ab9b7ae 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -470,8 +470,10 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, struct nlm_host *host, struct nlm_lock *lock, int wait, struct nlm_cookie *cookie, int reclaim) { - struct nlm_block *block = NULL; +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) struct inode *inode = nlmsvc_file_inode(file); +#endif + struct nlm_block *block = NULL; int error; int mode; int async_block = 0; @@ -484,7 +486,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, (long long)lock->fl.fl_end, wait); - if (inode->i_sb->s_export_op->flags & EXPORT_OP_SYNC_LOCKS) { + if (nlmsvc_file_file(file)->f_op->lock) { async_block = wait; wait = 0; } diff --git a/fs/nfs/export.c b/fs/nfs/export.c index 171c424cb6d53..01596f2d0a1ed 100644 --- a/fs/nfs/export.c +++ b/fs/nfs/export.c @@ -158,5 +158,5 @@ const struct export_operations nfs_export_ops = { .fetch_iversion = nfs_fetch_iversion, .flags = EXPORT_OP_NOWCC|EXPORT_OP_NOSUBTREECHK| EXPORT_OP_CLOSE_BEFORE_UNLINK|EXPORT_OP_REMOTE_FS| - EXPORT_OP_NOATOMIC_ATTR|EXPORT_OP_SYNC_LOCKS, + EXPORT_OP_NOATOMIC_ATTR, }; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 72e3833c30349..d8faccc554798 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -6842,7 +6842,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_blocked_lock *nbl = NULL; struct file_lock *file_lock = NULL; struct file_lock *conflock = NULL; - struct super_block *sb; __be32 status = 0; int lkflg; int err; @@ -6864,7 +6863,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, dprintk("NFSD: nfsd4_lock: permission denied!\n"); return status; } - sb = cstate->current_fh.fh_dentry->d_sb; if (lock->lk_is_new) { if (nfsd4_has_session(cstate)) @@ -6916,8 +6914,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, fp = lock_stp->st_stid.sc_file; switch (lock->lk_type) { case NFS4_READW_LT: - if (nfsd4_has_session(cstate) && - !(sb->s_export_op->flags & EXPORT_OP_SYNC_LOCKS)) + if (nfsd4_has_session(cstate)) fl_flags |= FL_SLEEP; fallthrough; case NFS4_READ_LT: @@ -6929,8 +6926,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, fl_type = F_RDLCK; break; case NFS4_WRITEW_LT: - if (nfsd4_has_session(cstate) && - !(sb->s_export_op->flags & EXPORT_OP_SYNC_LOCKS)) + if (nfsd4_has_session(cstate)) fl_flags |= FL_SLEEP; fallthrough; case NFS4_WRITE_LT: @@ -6951,6 +6947,16 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; } + /* + * Most filesystems with their own ->lock operations will block + * the nfsd thread waiting to acquire the lock. That leads to + * deadlocks (we don't want every nfsd thread tied up waiting + * for file locks), so don't attempt blocking lock notifications + * on those filesystems: + */ + if (nf->nf_file->f_op->lock) + fl_flags &= ~FL_SLEEP; + nbl = find_or_allocate_block(lock_sop, &fp->fi_fhandle, nn); if (!nbl) { dprintk("NFSD: %s: unable to allocate block!\n", __func__); diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index 3260fe7148462..fe848901fcc3a 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -221,8 +221,6 @@ struct export_operations { #define EXPORT_OP_NOATOMIC_ATTR (0x10) /* Filesystem cannot supply atomic attribute updates */ -#define EXPORT_OP_SYNC_LOCKS (0x20) /* Filesystem can't do - asychronous blocking locks */ unsigned long flags; }; diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index c4ae6506b8b36..fcef192e5e45e 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -303,10 +303,15 @@ void nlmsvc_invalidate_all(void); int nlmsvc_unlock_all_by_sb(struct super_block *sb); int nlmsvc_unlock_all_by_ip(struct sockaddr *server_addr); +static inline struct file *nlmsvc_file_file(struct nlm_file *file) +{ + return file->f_file[O_RDONLY] ? + file->f_file[O_RDONLY] : file->f_file[O_WRONLY]; +} + static inline struct inode *nlmsvc_file_inode(struct nlm_file *file) { - return locks_inode(file->f_file[O_RDONLY] ? - file->f_file[O_RDONLY] : file->f_file[O_WRONLY]); + return locks_inode(nlmsvc_file_file(file)); } static inline int __nlm_privileged_request4(const struct sockaddr *sap) From 47446d74f1707049067fee038507cdffda805631 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Fri, 17 Dec 2021 09:49:39 +0300 Subject: [PATCH 30/51] nfsd4: add refcount for nfsd4_blocked_lock nbl allocated in nfsd4_lock can be released by a several ways: directly in nfsd4_lock(), via nfs4_laundromat(), via another nfs command RELEASE_LOCKOWNER or via nfsd4_callback. This structure should be refcounted to be used and released correctly in all these cases. Refcount is initialized to 1 during allocation and is incremented when nbl is added into nbl_list/nbl_lru lists. Usually nbl is linked into both lists together, so only one refcount is used for both lists. However nfsd4_lock() should keep in mind that nbl can be present in one of lists only. This can happen if nbl was handled already by nfs4_laundromat/nfsd4_callback/etc. Refcount is decremented if vfs_lock_file() returns FILE_LOCK_DEFERRED, because nbl can be handled already by nfs4_laundromat/nfsd4_callback/etc. Refcount is not changed in find_blocked_lock() because of it reuses counter released after removing nbl from lists. Signed-off-by: Vasily Averin Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 25 ++++++++++++++++++++++--- fs/nfsd/state.h | 1 + 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index d8faccc554798..eb920f855969f 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -246,6 +246,7 @@ find_blocked_lock(struct nfs4_lockowner *lo, struct knfsd_fh *fh, list_for_each_entry(cur, &lo->lo_blocked, nbl_list) { if (fh_match(fh, &cur->nbl_fh)) { list_del_init(&cur->nbl_list); + WARN_ON(list_empty(&cur->nbl_lru)); list_del_init(&cur->nbl_lru); found = cur; break; @@ -271,6 +272,7 @@ find_or_allocate_block(struct nfs4_lockowner *lo, struct knfsd_fh *fh, INIT_LIST_HEAD(&nbl->nbl_lru); fh_copy_shallow(&nbl->nbl_fh, fh); locks_init_lock(&nbl->nbl_lock); + kref_init(&nbl->nbl_kref); nfsd4_init_cb(&nbl->nbl_cb, lo->lo_owner.so_client, &nfsd4_cb_notify_lock_ops, NFSPROC4_CLNT_CB_NOTIFY_LOCK); @@ -279,12 +281,21 @@ find_or_allocate_block(struct nfs4_lockowner *lo, struct knfsd_fh *fh, return nbl; } +static void +free_nbl(struct kref *kref) +{ + struct nfsd4_blocked_lock *nbl; + + nbl = container_of(kref, struct nfsd4_blocked_lock, nbl_kref); + kfree(nbl); +} + static void free_blocked_lock(struct nfsd4_blocked_lock *nbl) { locks_delete_block(&nbl->nbl_lock); locks_release_private(&nbl->nbl_lock); - kfree(nbl); + kref_put(&nbl->nbl_kref, free_nbl); } static void @@ -302,6 +313,7 @@ remove_blocked_locks(struct nfs4_lockowner *lo) struct nfsd4_blocked_lock, nbl_list); list_del_init(&nbl->nbl_list); + WARN_ON(list_empty(&nbl->nbl_lru)); list_move(&nbl->nbl_lru, &reaplist); } spin_unlock(&nn->blocked_locks_lock); @@ -6987,6 +6999,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, spin_lock(&nn->blocked_locks_lock); list_add_tail(&nbl->nbl_list, &lock_sop->lo_blocked); list_add_tail(&nbl->nbl_lru, &nn->blocked_locks_lru); + kref_get(&nbl->nbl_kref); spin_unlock(&nn->blocked_locks_lock); } @@ -6999,6 +7012,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nn->somebody_reclaimed = true; break; case FILE_LOCK_DEFERRED: + kref_put(&nbl->nbl_kref, free_nbl); nbl = NULL; fallthrough; case -EAGAIN: /* conflock holds conflicting lock */ @@ -7019,8 +7033,13 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, /* dequeue it if we queued it before */ if (fl_flags & FL_SLEEP) { spin_lock(&nn->blocked_locks_lock); - list_del_init(&nbl->nbl_list); - list_del_init(&nbl->nbl_lru); + if (!list_empty(&nbl->nbl_list) && + !list_empty(&nbl->nbl_lru)) { + list_del_init(&nbl->nbl_list); + list_del_init(&nbl->nbl_lru); + kref_put(&nbl->nbl_kref, free_nbl); + } + /* nbl can use one of lists to be linked to reaplist */ spin_unlock(&nn->blocked_locks_lock); } free_blocked_lock(nbl); diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 6eb3c7157214b..95457cfd37fc0 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -633,6 +633,7 @@ struct nfsd4_blocked_lock { struct file_lock nbl_lock; struct knfsd_fh nbl_fh; struct nfsd4_callback nbl_cb; + struct kref nbl_kref; }; struct nfsd4_compound_state; From 6a2f774424bfdcc2df3e17de0cefe74a4269cad5 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 21 Dec 2021 11:52:06 -0500 Subject: [PATCH 31/51] NFSD: Fix zero-length NFSv3 WRITEs The Linux NFS server currently responds to a zero-length NFSv3 WRITE request with NFS3ERR_IO. It responds to a zero-length NFSv4 WRITE with NFS4_OK and count of zero. RFC 1813 says of the WRITE procedure's @count argument: count The number of bytes of data to be written. If count is 0, the WRITE will succeed and return a count of 0, barring errors due to permissions checking. RFC 8881 has similar language for NFSv4, though NFSv4 removed the explicit @count argument because that value is already contained in the opaque payload array. The synthetic client pynfs's WRT4 and WRT15 tests do emit zero- length WRITEs to exercise this spec requirement. Commit fdec6114ee1f ("nfsd4: zero-length WRITE should succeed") addressed the same problem there with the same fix. But interestingly the Linux NFS client does not appear to emit zero- length WRITEs, instead squelching them. I'm not aware of a test that can generate such WRITEs for NFSv3, so I wrote a naive C program to generate a zero-length WRITE and test this fix. Fixes: 8154ef2776aa ("NFSD: Clean up legacy NFS WRITE argument XDR decoders") Reported-by: Trond Myklebust Signed-off-by: Chuck Lever Cc: stable@vger.kernel.org Signed-off-by: Chuck Lever --- fs/nfsd/nfs3proc.c | 6 +----- fs/nfsd/nfsproc.c | 5 ----- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 4418517f6f120..2c681785186f7 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -202,15 +202,11 @@ nfsd3_proc_write(struct svc_rqst *rqstp) fh_copy(&resp->fh, &argp->fh); resp->committed = argp->stable; nvecs = svc_fill_write_vector(rqstp, &argp->payload); - if (!nvecs) { - resp->status = nfserr_io; - goto out; - } + resp->status = nfsd_write(rqstp, &resp->fh, argp->offset, rqstp->rq_vec, nvecs, &cnt, resp->committed, resp->verf); resp->count = cnt; -out: return rpc_success; } diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index eea5b59b6a6ca..1743ed04197e8 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -235,10 +235,6 @@ nfsd_proc_write(struct svc_rqst *rqstp) argp->len, argp->offset); nvecs = svc_fill_write_vector(rqstp, &argp->payload); - if (!nvecs) { - resp->status = nfserr_io; - goto out; - } resp->status = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), argp->offset, rqstp->rq_vec, nvecs, @@ -247,7 +243,6 @@ nfsd_proc_write(struct svc_rqst *rqstp) resp->status = fh_getattr(&resp->fh, &resp->stat); else if (resp->status == nfserr_jukebox) return rpc_drop_reply; -out: return rpc_success; } From b3d0db706c77d02055910fcfe2f6eb5155ff9d5e Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Sat, 18 Dec 2021 20:37:54 -0500 Subject: [PATCH 32/51] nfsd: map EBADF Now that we have open file cache, it is possible that another client deletes the file and DP will not know about it. Then IO to MDS would fail with BADSTATEID and knfsd would start state recovery, which should fail as well and then nfs read/write will fail with EBADF. And it triggers a WARN() in nfserrno(). -----------[ cut here ]------------ WARNING: CPU: 0 PID: 13529 at fs/nfsd/nfsproc.c:758 nfserrno+0x58/0x70 [nfsd]() nfsd: non-standard errno: -9 modules linked in: nfsv3 nfs_layout_flexfiles rpcsec_gss_krb5 nfsv4 dns_resolver nfs fscache ip6t_rpfilter ip6t_REJECT nf_reject_ipv6 xt_connt pata_acpi floppy CPU: 0 PID: 13529 Comm: nfsd Tainted: G W 4.1.5-00307-g6e6579b #7 Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 09/30/2014 0000000000000000 00000000464e6c9c ffff88079085fba8 ffffffff81789936 0000000000000000 ffff88079085fc00 ffff88079085fbe8 ffffffff810a08ea ffff88079085fbe8 ffff88080f45c900 ffff88080f627d50 ffff880790c46a48 all Trace: [] dump_stack+0x45/0x57 [] warn_slowpath_common+0x8a/0xc0 [] warn_slowpath_fmt+0x55/0x70 [] ? splice_direct_to_actor+0x148/0x230 [] ? fsid_source+0x60/0x60 [nfsd] [] nfserrno+0x58/0x70 [nfsd] [] nfsd_finish_read+0x97/0xb0 [nfsd] [] nfsd_splice_read+0x76/0xa0 [nfsd] [] nfsd_read+0xc1/0xd0 [nfsd] [] ? svc_tcp_adjust_wspace+0x12/0x30 [sunrpc] [] nfsd3_proc_read+0xba/0x150 [nfsd] [] nfsd_dispatch+0xc3/0x210 [nfsd] [] ? svc_tcp_adjust_wspace+0x12/0x30 [sunrpc] [] svc_process_common+0x453/0x6f0 [sunrpc] [] svc_process+0x113/0x1b0 [sunrpc] [] nfsd+0xff/0x170 [nfsd] [] ? nfsd_destroy+0x80/0x80 [nfsd] [] kthread+0xd8/0xf0 [] ? kthread_create_on_node+0x1b0/0x1b0 [] ret_from_fork+0x42/0x70 [] ? kthread_create_on_node+0x1b0/0x1b0 Signed-off-by: Peng Tao Signed-off-by: Lance Shelton Signed-off-by: Trond Myklebust Signed-off-by: Chuck Lever --- fs/nfsd/nfsproc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 1743ed04197e8..54436919be8cb 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -845,6 +845,7 @@ nfserrno (int errno) { nfserr_io, -EIO }, { nfserr_nxio, -ENXIO }, { nfserr_fbig, -E2BIG }, + { nfserr_stale, -EBADF }, { nfserr_acces, -EACCES }, { nfserr_exist, -EEXIST }, { nfserr_xdev, -EXDEV }, From a2694e51f60c5a18c7e43d1a9feaa46d7f153e65 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sat, 18 Dec 2021 20:37:55 -0500 Subject: [PATCH 33/51] nfsd: Add errno mapping for EREMOTEIO The NFS client can occasionally return EREMOTEIO when signalling issues with the server. ...map to NFSERR_IO. Signed-off-by: Jeff Layton Signed-off-by: Lance Shelton Signed-off-by: Trond Myklebust Signed-off-by: Chuck Lever --- fs/nfsd/nfsproc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 54436919be8cb..da0414ecf4d27 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -874,6 +874,7 @@ nfserrno (int errno) { nfserr_toosmall, -ETOOSMALL }, { nfserr_serverfault, -ESERVERFAULT }, { nfserr_serverfault, -ENFILE }, + { nfserr_io, -EREMOTEIO }, { nfserr_io, -EUCLEAN }, { nfserr_perm, -ENOKEY }, { nfserr_no_grace, -ENOGRACE}, From 12bcbd40fd931472c7fc9cf3bfe66799ece93ed8 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sat, 18 Dec 2021 20:37:56 -0500 Subject: [PATCH 34/51] nfsd: Retry once in nfsd_open on an -EOPENSTALE return If we get back -EOPENSTALE from an NFSv4 open, then we either got some unhandled error or the inode we got back was not the same as the one associated with the dentry. We really have no recourse in that situation other than to retry the open, and if it fails to just return nfserr_stale back to the client. Signed-off-by: Jeff Layton Signed-off-by: Lance Shelton Signed-off-by: Trond Myklebust Signed-off-by: Chuck Lever --- fs/nfsd/nfsproc.c | 1 + fs/nfsd/vfs.c | 10 +++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index da0414ecf4d27..48c7344151df3 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -875,6 +875,7 @@ nfserrno (int errno) { nfserr_serverfault, -ESERVERFAULT }, { nfserr_serverfault, -ENFILE }, { nfserr_io, -EREMOTEIO }, + { nfserr_stale, -EOPENSTALE }, { nfserr_io, -EUCLEAN }, { nfserr_perm, -ENOKEY }, { nfserr_no_grace, -ENOGRACE}, diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index c99857689e2c2..0faa3839ea6cd 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -777,6 +777,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int may_flags, struct file **filp) { __be32 err; + bool retried = false; validate_process_creds(); /* @@ -792,9 +793,16 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, */ if (type == S_IFREG) may_flags |= NFSD_MAY_OWNER_OVERRIDE; +retry: err = fh_verify(rqstp, fhp, type, may_flags); - if (!err) + if (!err) { err = __nfsd_open(rqstp, fhp, type, may_flags, filp); + if (err == nfserr_stale && !retried) { + retried = true; + fh_put(fhp); + goto retry; + } + } validate_process_creds(); return err; } From f11ad7aa653130b71e2e89bed207f387718216d5 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 28 Dec 2021 12:35:43 -0500 Subject: [PATCH 35/51] NFSD: Fix verifier returned in stable WRITEs RFC 8881 explains the purpose of the write verifier this way: > The final portion of the result is the field writeverf. This field > is the write verifier and is a cookie that the client can use to > determine whether a server has changed instance state (e.g., server > restart) between a call to WRITE and a subsequent call to either > WRITE or COMMIT. But then it says: > This cookie MUST be unchanged during a single instance of the > NFSv4.1 server and MUST be unique between instances of the NFSv4.1 > server. If the cookie changes, then the client MUST assume that > any data written with an UNSTABLE4 value for committed and an old > writeverf in the reply has been lost and will need to be > recovered. RFC 1813 has similar language for NFSv3. NFSv2 does not have a write verifier since it doesn't implement the COMMIT procedure. Since commit 19e0663ff9bc ("nfsd: Ensure sampling of the write verifier is atomic with the write"), the Linux NFS server has returned a boot-time-based verifier for UNSTABLE WRITEs, but a zero verifier for FILE_SYNC and DATA_SYNC WRITEs. FILE_SYNC and DATA_SYNC WRITEs are not followed up with a COMMIT, so there's no need for clients to compare verifiers for stable writes. However, by returning a different verifier for stable and unstable writes, the above commit puts the Linux NFS server a step farther out of compliance with the first MUST above. At least one NFS client (FreeBSD) noticed the difference, making this a potential regression. Reported-by: Rick Macklem Link: https://lore.kernel.org/linux-nfs/YQXPR0101MB096857EEACF04A6DF1FC6D9BDD749@YQXPR0101MB0968.CANPRD01.PROD.OUTLOOK.COM/T/ Fixes: 19e0663ff9bc ("nfsd: Ensure sampling of the write verifier is atomic with the write") Signed-off-by: Chuck Lever --- fs/nfsd/vfs.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 0faa3839ea6cd..74c3451c20898 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -995,6 +995,10 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, iov_iter_kvec(&iter, WRITE, vec, vlen, *cnt); if (flags & RWF_SYNC) { down_write(&nf->nf_rwsem); + if (verf) + nfsd_copy_boot_verifier(verf, + net_generic(SVC_NET(rqstp), + nfsd_net_id)); host_err = vfs_iter_write(file, &iter, &pos, flags); if (host_err < 0) nfsd_reset_boot_verifier(net_generic(SVC_NET(rqstp), From 555dbf1a9aac6d3150c8b52fa35f768a692f4eeb Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 18 Dec 2021 20:38:01 -0500 Subject: [PATCH 36/51] nfsd: Replace use of rwsem with errseq_t The nfsd_file nf_rwsem is currently being used to separate file write and commit instances to ensure that we catch errors and apply them to the correct write/commit. We can improve scalability at the expense of a little accuracy (some extra false positives) by replacing the nf_rwsem with more careful use of the errseq_t mechanism to track errors across the different operations. Signed-off-by: Trond Myklebust Signed-off-by: Chuck Lever [ cel: rebased on zero-verifier fix ] --- fs/nfsd/filecache.c | 1 - fs/nfsd/filecache.h | 1 - fs/nfsd/nfs4proc.c | 16 +++++++++------- fs/nfsd/vfs.c | 40 +++++++++++++++------------------------- 4 files changed, 24 insertions(+), 34 deletions(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index aa5dca498b27e..e2904540e463c 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -189,7 +189,6 @@ nfsd_file_alloc(struct inode *inode, unsigned int may, unsigned int hashval, __set_bit(NFSD_FILE_BREAK_READ, &nf->nf_flags); } nf->nf_mark = NULL; - init_rwsem(&nf->nf_rwsem); trace_nfsd_file_alloc(nf); } return nf; diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h index 7872df5a0fe3a..435ceab27897a 100644 --- a/fs/nfsd/filecache.h +++ b/fs/nfsd/filecache.h @@ -46,7 +46,6 @@ struct nfsd_file { refcount_t nf_ref; unsigned char nf_may; struct nfsd_file_mark *nf_mark; - struct rw_semaphore nf_rwsem; }; int nfsd_file_cache_init(void); diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index a6dc5e18c498c..56405fc58bfcc 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1510,6 +1510,9 @@ static void nfsd4_init_copy_res(struct nfsd4_copy *copy, bool sync) static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy) { + struct file *dst = copy->nf_dst->nf_file; + struct file *src = copy->nf_src->nf_file; + errseq_t since; ssize_t bytes_copied = 0; u64 bytes_total = copy->cp_count; u64 src_pos = copy->cp_src_pos; @@ -1522,9 +1525,8 @@ static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy) do { if (kthread_should_stop()) break; - bytes_copied = nfsd_copy_file_range(copy->nf_src->nf_file, - src_pos, copy->nf_dst->nf_file, dst_pos, - bytes_total); + bytes_copied = nfsd_copy_file_range(src, src_pos, dst, dst_pos, + bytes_total); if (bytes_copied <= 0) break; bytes_total -= bytes_copied; @@ -1534,11 +1536,11 @@ static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy) } while (bytes_total > 0 && !copy->cp_synchronous); /* for a non-zero asynchronous copy do a commit of data */ if (!copy->cp_synchronous && copy->cp_res.wr_bytes_written > 0) { - down_write(©->nf_dst->nf_rwsem); - status = vfs_fsync_range(copy->nf_dst->nf_file, - copy->cp_dst_pos, + since = READ_ONCE(dst->f_wb_err); + status = vfs_fsync_range(dst, copy->cp_dst_pos, copy->cp_res.wr_bytes_written, 0); - up_write(©->nf_dst->nf_rwsem); + if (!status) + status = filemap_check_wb_err(dst->f_mapping, since); if (!status) copy->committed = true; } diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 74c3451c20898..316ed702d518c 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -522,10 +522,11 @@ __be32 nfsd4_clone_file_range(struct nfsd_file *nf_src, u64 src_pos, { struct file *src = nf_src->nf_file; struct file *dst = nf_dst->nf_file; + errseq_t since; loff_t cloned; __be32 ret = 0; - down_write(&nf_dst->nf_rwsem); + since = READ_ONCE(dst->f_wb_err); cloned = vfs_clone_file_range(src, src_pos, dst, dst_pos, count, 0); if (cloned < 0) { ret = nfserrno(cloned); @@ -539,6 +540,8 @@ __be32 nfsd4_clone_file_range(struct nfsd_file *nf_src, u64 src_pos, loff_t dst_end = count ? dst_pos + count - 1 : LLONG_MAX; int status = vfs_fsync_range(dst, dst_pos, dst_end, 0); + if (!status) + status = filemap_check_wb_err(dst->f_mapping, since); if (!status) status = commit_inode_metadata(file_inode(src)); if (status < 0) { @@ -548,7 +551,6 @@ __be32 nfsd4_clone_file_range(struct nfsd_file *nf_src, u64 src_pos, } } out_err: - up_write(&nf_dst->nf_rwsem); return ret; } @@ -956,6 +958,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, struct super_block *sb = file_inode(file)->i_sb; struct svc_export *exp; struct iov_iter iter; + errseq_t since; __be32 nfserr; int host_err; int use_wgather; @@ -993,8 +996,8 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, flags |= RWF_SYNC; iov_iter_kvec(&iter, WRITE, vec, vlen, *cnt); + since = READ_ONCE(file->f_wb_err); if (flags & RWF_SYNC) { - down_write(&nf->nf_rwsem); if (verf) nfsd_copy_boot_verifier(verf, net_generic(SVC_NET(rqstp), @@ -1003,15 +1006,12 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, if (host_err < 0) nfsd_reset_boot_verifier(net_generic(SVC_NET(rqstp), nfsd_net_id)); - up_write(&nf->nf_rwsem); } else { - down_read(&nf->nf_rwsem); if (verf) nfsd_copy_boot_verifier(verf, net_generic(SVC_NET(rqstp), nfsd_net_id)); host_err = vfs_iter_write(file, &iter, &pos, flags); - up_read(&nf->nf_rwsem); } if (host_err < 0) { nfsd_reset_boot_verifier(net_generic(SVC_NET(rqstp), @@ -1021,6 +1021,9 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, *cnt = host_err; nfsd_stats_io_write_add(exp, *cnt); fsnotify_modify(file); + host_err = filemap_check_wb_err(file->f_mapping, since); + if (host_err < 0) + goto out_nfserr; if (stable && use_wgather) { host_err = wait_for_concurrent_writes(file); @@ -1101,19 +1104,6 @@ nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset, } #ifdef CONFIG_NFSD_V3 -static int -nfsd_filemap_write_and_wait_range(struct nfsd_file *nf, loff_t offset, - loff_t end) -{ - struct address_space *mapping = nf->nf_file->f_mapping; - int ret = filemap_fdatawrite_range(mapping, offset, end); - - if (ret) - return ret; - filemap_fdatawait_range_keep_errors(mapping, offset, end); - return 0; -} - /* * Commit all pending writes to stable storage. * @@ -1144,25 +1134,25 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, if (err) goto out; if (EX_ISSYNC(fhp->fh_export)) { - int err2 = nfsd_filemap_write_and_wait_range(nf, offset, end); + errseq_t since = READ_ONCE(nf->nf_file->f_wb_err); + int err2; - down_write(&nf->nf_rwsem); - if (!err2) - err2 = vfs_fsync_range(nf->nf_file, offset, end, 0); + err2 = vfs_fsync_range(nf->nf_file, offset, end, 0); switch (err2) { case 0: nfsd_copy_boot_verifier(verf, net_generic(nf->nf_net, nfsd_net_id)); + err2 = filemap_check_wb_err(nf->nf_file->f_mapping, + since); break; case -EINVAL: err = nfserr_notsupp; break; default: - err = nfserrno(err2); nfsd_reset_boot_verifier(net_generic(nf->nf_net, nfsd_net_id)); } - up_write(&nf->nf_rwsem); + err = nfserrno(err2); } else nfsd_copy_boot_verifier(verf, net_generic(nf->nf_net, nfsd_net_id)); From 33388b3aefefd4d83764dab8038cb54068161a44 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 28 Dec 2021 14:19:41 -0500 Subject: [PATCH 37/51] NFSD: Clean up nfsd_vfs_write() The RWF_SYNC and !RWF_SYNC arms are now exactly alike except that the RWF_SYNC arm resets the boot verifier twice in a row. Fix that redundancy and de-duplicate the code. Signed-off-by: Chuck Lever --- fs/nfsd/vfs.c | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 316ed702d518c..8f0ac710fd1a4 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -997,22 +997,11 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, iov_iter_kvec(&iter, WRITE, vec, vlen, *cnt); since = READ_ONCE(file->f_wb_err); - if (flags & RWF_SYNC) { - if (verf) - nfsd_copy_boot_verifier(verf, - net_generic(SVC_NET(rqstp), - nfsd_net_id)); - host_err = vfs_iter_write(file, &iter, &pos, flags); - if (host_err < 0) - nfsd_reset_boot_verifier(net_generic(SVC_NET(rqstp), - nfsd_net_id)); - } else { - if (verf) - nfsd_copy_boot_verifier(verf, - net_generic(SVC_NET(rqstp), - nfsd_net_id)); - host_err = vfs_iter_write(file, &iter, &pos, flags); - } + if (verf) + nfsd_copy_boot_verifier(verf, + net_generic(SVC_NET(rqstp), + nfsd_net_id)); + host_err = vfs_iter_write(file, &iter, &pos, flags); if (host_err < 0) { nfsd_reset_boot_verifier(net_generic(SVC_NET(rqstp), nfsd_net_id)); From fb7622c2dbd1aa41133a8c73e1137b833c074519 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 28 Dec 2021 12:41:32 -0500 Subject: [PATCH 38/51] NFSD: De-duplicate net_generic(SVC_NET(rqstp), nfsd_net_id) Since this pointer is used repeatedly, move it to a stack variable. Signed-off-by: Chuck Lever --- fs/nfsd/vfs.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 8f0ac710fd1a4..2e473d2f47e51 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -954,6 +954,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, unsigned long *cnt, int stable, __be32 *verf) { + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); struct file *file = nf->nf_file; struct super_block *sb = file_inode(file)->i_sb; struct svc_export *exp; @@ -998,13 +999,10 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, iov_iter_kvec(&iter, WRITE, vec, vlen, *cnt); since = READ_ONCE(file->f_wb_err); if (verf) - nfsd_copy_boot_verifier(verf, - net_generic(SVC_NET(rqstp), - nfsd_net_id)); + nfsd_copy_boot_verifier(verf, nn); host_err = vfs_iter_write(file, &iter, &pos, flags); if (host_err < 0) { - nfsd_reset_boot_verifier(net_generic(SVC_NET(rqstp), - nfsd_net_id)); + nfsd_reset_boot_verifier(nn); goto out_nfserr; } *cnt = host_err; @@ -1017,8 +1015,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, if (stable && use_wgather) { host_err = wait_for_concurrent_writes(file); if (host_err < 0) - nfsd_reset_boot_verifier(net_generic(SVC_NET(rqstp), - nfsd_net_id)); + nfsd_reset_boot_verifier(nn); } out_nfserr: From 2c445a0e72cb1fbfbdb7f9473c53556ee27c1d90 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 28 Dec 2021 14:26:03 -0500 Subject: [PATCH 39/51] NFSD: De-duplicate net_generic(nf->nf_net, nfsd_net_id) Since this pointer is used repeatedly, move it to a stack variable. Signed-off-by: Chuck Lever --- fs/nfsd/vfs.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 2e473d2f47e51..c22511decc4cc 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1103,6 +1103,7 @@ __be32 nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset, unsigned long count, __be32 *verf) { + struct nfsd_net *nn; struct nfsd_file *nf; loff_t end = LLONG_MAX; __be32 err = nfserr_inval; @@ -1119,6 +1120,7 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, NFSD_MAY_WRITE|NFSD_MAY_NOT_BREAK_LEASE, &nf); if (err) goto out; + nn = net_generic(nf->nf_net, nfsd_net_id); if (EX_ISSYNC(fhp->fh_export)) { errseq_t since = READ_ONCE(nf->nf_file->f_wb_err); int err2; @@ -1126,8 +1128,7 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, err2 = vfs_fsync_range(nf->nf_file, offset, end, 0); switch (err2) { case 0: - nfsd_copy_boot_verifier(verf, net_generic(nf->nf_net, - nfsd_net_id)); + nfsd_copy_boot_verifier(verf, nn); err2 = filemap_check_wb_err(nf->nf_file->f_mapping, since); break; @@ -1135,13 +1136,11 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, err = nfserr_notsupp; break; default: - nfsd_reset_boot_verifier(net_generic(nf->nf_net, - nfsd_net_id)); + nfsd_reset_boot_verifier(nn); } err = nfserrno(err2); } else - nfsd_copy_boot_verifier(verf, net_generic(nf->nf_net, - nfsd_net_id)); + nfsd_copy_boot_verifier(verf, nn); nfsd_file_put(nf); out: From a2f4c3fa4db94ba44d32a72201927cfd132a8e82 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 18 Dec 2021 20:38:00 -0500 Subject: [PATCH 40/51] nfsd: Add a tracepoint for errors in nfsd4_clone_file_range() Since a clone error commit can cause the boot verifier to change, we should trace those errors. Signed-off-by: Trond Myklebust Signed-off-by: Chuck Lever [ cel: Addressed a checkpatch.pl splat in fs/nfsd/vfs.h ] --- fs/nfsd/nfs4proc.c | 2 +- fs/nfsd/trace.h | 50 ++++++++++++++++++++++++++++++++++++++++++++++ fs/nfsd/vfs.c | 18 +++++++++++++++-- fs/nfsd/vfs.h | 3 ++- 4 files changed, 69 insertions(+), 4 deletions(-) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 56405fc58bfcc..43057080d2aa5 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1101,7 +1101,7 @@ nfsd4_clone(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (status) goto out; - status = nfsd4_clone_file_range(src, clone->cl_src_pos, + status = nfsd4_clone_file_range(rqstp, src, clone->cl_src_pos, dst, clone->cl_dst_pos, clone->cl_count, EX_ISSYNC(cstate->current_fh.fh_export)); diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 6afb320931042..a0b2b8d87de6f 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -399,6 +399,56 @@ TRACE_EVENT(nfsd_dirent, ) ) +DECLARE_EVENT_CLASS(nfsd_copy_err_class, + TP_PROTO(struct svc_rqst *rqstp, + struct svc_fh *src_fhp, + loff_t src_offset, + struct svc_fh *dst_fhp, + loff_t dst_offset, + u64 count, + int status), + TP_ARGS(rqstp, src_fhp, src_offset, dst_fhp, dst_offset, count, status), + TP_STRUCT__entry( + __field(u32, xid) + __field(u32, src_fh_hash) + __field(loff_t, src_offset) + __field(u32, dst_fh_hash) + __field(loff_t, dst_offset) + __field(u64, count) + __field(int, status) + ), + TP_fast_assign( + __entry->xid = be32_to_cpu(rqstp->rq_xid); + __entry->src_fh_hash = knfsd_fh_hash(&src_fhp->fh_handle); + __entry->src_offset = src_offset; + __entry->dst_fh_hash = knfsd_fh_hash(&dst_fhp->fh_handle); + __entry->dst_offset = dst_offset; + __entry->count = count; + __entry->status = status; + ), + TP_printk("xid=0x%08x src_fh_hash=0x%08x src_offset=%lld " + "dst_fh_hash=0x%08x dst_offset=%lld " + "count=%llu status=%d", + __entry->xid, __entry->src_fh_hash, __entry->src_offset, + __entry->dst_fh_hash, __entry->dst_offset, + (unsigned long long)__entry->count, + __entry->status) +) + +#define DEFINE_NFSD_COPY_ERR_EVENT(name) \ +DEFINE_EVENT(nfsd_copy_err_class, nfsd_##name, \ + TP_PROTO(struct svc_rqst *rqstp, \ + struct svc_fh *src_fhp, \ + loff_t src_offset, \ + struct svc_fh *dst_fhp, \ + loff_t dst_offset, \ + u64 count, \ + int status), \ + TP_ARGS(rqstp, src_fhp, src_offset, dst_fhp, dst_offset, \ + count, status)) + +DEFINE_NFSD_COPY_ERR_EVENT(clone_file_range_err); + #include "state.h" #include "filecache.h" #include "vfs.h" diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index c22511decc4cc..70ea7e0aae073 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -40,6 +40,7 @@ #include "../internal.h" #include "acl.h" #include "idmap.h" +#include "xdr4.h" #endif /* CONFIG_NFSD_V4 */ #include "nfsd.h" @@ -517,8 +518,15 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp, } #endif -__be32 nfsd4_clone_file_range(struct nfsd_file *nf_src, u64 src_pos, - struct nfsd_file *nf_dst, u64 dst_pos, u64 count, bool sync) +static struct nfsd4_compound_state *nfsd4_get_cstate(struct svc_rqst *rqstp) +{ + return &((struct nfsd4_compoundres *)rqstp->rq_resp)->cstate; +} + +__be32 nfsd4_clone_file_range(struct svc_rqst *rqstp, + struct nfsd_file *nf_src, u64 src_pos, + struct nfsd_file *nf_dst, u64 dst_pos, + u64 count, bool sync) { struct file *src = nf_src->nf_file; struct file *dst = nf_dst->nf_file; @@ -545,6 +553,12 @@ __be32 nfsd4_clone_file_range(struct nfsd_file *nf_src, u64 src_pos, if (!status) status = commit_inode_metadata(file_inode(src)); if (status < 0) { + trace_nfsd_clone_file_range_err(rqstp, + &nfsd4_get_cstate(rqstp)->save_fh, + src_pos, + &nfsd4_get_cstate(rqstp)->current_fh, + dst_pos, + count, status); nfsd_reset_boot_verifier(net_generic(nf_dst->nf_net, nfsd_net_id)); ret = nfserrno(status); diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index b21b76e6b9a87..9f56dcb22ff72 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -57,7 +57,8 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *, struct svc_fh *, struct xdr_netobj *); __be32 nfsd4_vfs_fallocate(struct svc_rqst *, struct svc_fh *, struct file *, loff_t, loff_t, int); -__be32 nfsd4_clone_file_range(struct nfsd_file *nf_src, u64 src_pos, +__be32 nfsd4_clone_file_range(struct svc_rqst *rqstp, + struct nfsd_file *nf_src, u64 src_pos, struct nfsd_file *nf_dst, u64 dst_pos, u64 count, bool sync); #endif /* CONFIG_NFSD_V4 */ From cdc556600c0133575487cc69fb3128440b3c3e92 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 30 Dec 2021 10:26:18 -0500 Subject: [PATCH 41/51] NFSD: Write verifier might go backwards When vfs_iter_write() starts to fail because a file system is full, a bunch of writes can fail at once with ENOSPC. These writes repeatedly invoke nfsd_reset_boot_verifier() in quick succession. Ensure that the time it grabs doesn't go backwards due to an ntp adjustment going on at the same time. Signed-off-by: Chuck Lever --- fs/nfsd/nfssvc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 14c1ef6f8cc74..6eccf67002504 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -363,7 +363,7 @@ void nfsd_copy_boot_verifier(__be32 verf[2], struct nfsd_net *nn) static void nfsd_reset_boot_verifier_locked(struct nfsd_net *nn) { - ktime_get_real_ts64(&nn->nfssvc_boot); + ktime_get_raw_ts64(&nn->nfssvc_boot); } void nfsd_reset_boot_verifier(struct nfsd_net *nn) From 91d2e9b56cf5c80f9efc530d494968369a8a0e0d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 29 Dec 2021 14:43:16 -0500 Subject: [PATCH 42/51] NFSD: Clean up the nfsd_net::nfssvc_boot field There are two boot-time fields in struct nfsd_net: one called boot_time and one called nfssvc_boot. The latter is used only to form write verifiers, but its documenting comment declares: /* Time of server startup */ Since commit 27c438f53e79 ("nfsd: Support the server resetting the boot verifier"), this field can be reset at any time; it's no longer tied to server restart. So that comment is stale. Also, according to pahole, struct timespec64 is 16 bytes long on x86_64. The nfssvc_boot field is used only to form a write verifier, which is 8 bytes long. Let's clarify this situation by manufacturing an 8-byte verifier in nfs_reset_boot_verifier() and storing only that in struct nfsd_net. We're grabbing 128 bits of time, so compress all of those into a 64-bit verifier instead of throwing out the high-order bits. In the future, the siphash_key can be re-used for other hashed objects per-nfsd_net. Signed-off-by: Chuck Lever --- fs/nfsd/netns.h | 8 +++++--- fs/nfsd/nfsctl.c | 3 ++- fs/nfsd/nfssvc.c | 51 ++++++++++++++++++++++++++++++++++++------------ 3 files changed, 45 insertions(+), 17 deletions(-) diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 9e8b77d2a3a47..a6ed300259849 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -11,6 +11,7 @@ #include #include #include +#include /* Hash tables for nfs4_clientid state */ #define CLIENT_HASH_BITS 4 @@ -108,9 +109,8 @@ struct nfsd_net { bool nfsd_net_up; bool lockd_up; - /* Time of server startup */ - struct timespec64 nfssvc_boot; - seqlock_t boot_lock; + seqlock_t writeverf_lock; + unsigned char writeverf[8]; /* * Max number of connections this nfsd container will allow. Defaults @@ -187,6 +187,8 @@ struct nfsd_net { char nfsd_name[UNX_MAXNODENAME+1]; struct nfsd_fcache_disposal *fcache_disposal; + + siphash_key_t siphash_key; }; /* Simple check to find out if a given net was properly initialized */ diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index a8ad71567fc72..b9f27fbcd7684 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1483,7 +1483,8 @@ static __net_init int nfsd_init_net(struct net *net) nn->clientid_counter = nn->clientid_base + 1; nn->s2s_cp_cl_id = nn->clientid_counter++; - seqlock_init(&nn->boot_lock); + get_random_bytes(&nn->siphash_key, sizeof(nn->siphash_key)); + seqlock_init(&nn->writeverf_lock); return 0; diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 6eccf67002504..81d47049588f2 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -344,33 +345,57 @@ static bool nfsd_needs_lockd(struct nfsd_net *nn) return nfsd_vers(nn, 2, NFSD_TEST) || nfsd_vers(nn, 3, NFSD_TEST); } +/** + * nfsd_copy_boot_verifier - Atomically copy a write verifier + * @verf: buffer in which to receive the verifier cookie + * @nn: NFS net namespace + * + * This function provides a wait-free mechanism for copying the + * namespace's boot verifier without tearing it. + */ void nfsd_copy_boot_verifier(__be32 verf[2], struct nfsd_net *nn) { int seq = 0; do { - read_seqbegin_or_lock(&nn->boot_lock, &seq); - /* - * This is opaque to client, so no need to byte-swap. Use - * __force to keep sparse happy. y2038 time_t overflow is - * irrelevant in this usage - */ - verf[0] = (__force __be32)nn->nfssvc_boot.tv_sec; - verf[1] = (__force __be32)nn->nfssvc_boot.tv_nsec; - } while (need_seqretry(&nn->boot_lock, seq)); - done_seqretry(&nn->boot_lock, seq); + read_seqbegin_or_lock(&nn->writeverf_lock, &seq); + memcpy(verf, nn->writeverf, sizeof(*verf)); + } while (need_seqretry(&nn->writeverf_lock, seq)); + done_seqretry(&nn->writeverf_lock, seq); } static void nfsd_reset_boot_verifier_locked(struct nfsd_net *nn) { - ktime_get_raw_ts64(&nn->nfssvc_boot); + struct timespec64 now; + u64 verf; + + /* + * Because the time value is hashed, y2038 time_t overflow + * is irrelevant in this usage. + */ + ktime_get_raw_ts64(&now); + verf = siphash_2u64(now.tv_sec, now.tv_nsec, &nn->siphash_key); + memcpy(nn->writeverf, &verf, sizeof(nn->writeverf)); } +/** + * nfsd_reset_boot_verifier - Generate a new boot verifier + * @nn: NFS net namespace + * + * This function updates the ->writeverf field of @nn. This field + * contains an opaque cookie that, according to Section 18.32.3 of + * RFC 8881, "the client can use to determine whether a server has + * changed instance state (e.g., server restart) between a call to + * WRITE and a subsequent call to either WRITE or COMMIT. This + * cookie MUST be unchanged during a single instance of the NFSv4.1 + * server and MUST be unique between instances of the NFSv4.1 + * server." + */ void nfsd_reset_boot_verifier(struct nfsd_net *nn) { - write_seqlock(&nn->boot_lock); + write_seqlock(&nn->writeverf_lock); nfsd_reset_boot_verifier_locked(nn); - write_sequnlock(&nn->boot_lock); + write_sequnlock(&nn->writeverf_lock); } static int nfsd_startup_net(struct net *net, const struct cred *cred) From 3988a57885eeac05ef89f0ab4d7e47b52fbcf630 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 30 Dec 2021 10:22:05 -0500 Subject: [PATCH 43/51] NFSD: Rename boot verifier functions Clean up: These functions handle what the specs call a write verifier, which in the Linux NFS server implementation is now divorced from the server's boot instance Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 2 +- fs/nfsd/netns.h | 4 ++-- fs/nfsd/nfs4proc.c | 2 +- fs/nfsd/nfssvc.c | 16 ++++++++-------- fs/nfsd/vfs.c | 16 ++++++++-------- 5 files changed, 20 insertions(+), 20 deletions(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index e2904540e463c..8bc807c5fea4c 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -243,7 +243,7 @@ nfsd_file_do_unhash(struct nfsd_file *nf) trace_nfsd_file_unhash(nf); if (nfsd_file_check_write_error(nf)) - nfsd_reset_boot_verifier(net_generic(nf->nf_net, nfsd_net_id)); + nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id)); --nfsd_file_hashtbl[nf->nf_hashval].nfb_count; hlist_del_rcu(&nf->nf_node); atomic_long_dec(&nfsd_filecache_count); diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index a6ed300259849..1b1a962a18041 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -198,6 +198,6 @@ extern void nfsd_netns_free_versions(struct nfsd_net *nn); extern unsigned int nfsd_net_id; -void nfsd_copy_boot_verifier(__be32 verf[2], struct nfsd_net *nn); -void nfsd_reset_boot_verifier(struct nfsd_net *nn); +void nfsd_copy_write_verifier(__be32 verf[2], struct nfsd_net *nn); +void nfsd_reset_write_verifier(struct nfsd_net *nn); #endif /* __NFSD_NETNS_H__ */ diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 43057080d2aa5..6f53eb90c6b44 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -598,7 +598,7 @@ static void gen_boot_verifier(nfs4_verifier *verifier, struct net *net) BUILD_BUG_ON(2*sizeof(*verf) != sizeof(verifier->data)); - nfsd_copy_boot_verifier(verf, net_generic(net, nfsd_net_id)); + nfsd_copy_write_verifier(verf, net_generic(net, nfsd_net_id)); } static __be32 diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 81d47049588f2..07193595b8e0a 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -346,14 +346,14 @@ static bool nfsd_needs_lockd(struct nfsd_net *nn) } /** - * nfsd_copy_boot_verifier - Atomically copy a write verifier + * nfsd_copy_write_verifier - Atomically copy a write verifier * @verf: buffer in which to receive the verifier cookie * @nn: NFS net namespace * * This function provides a wait-free mechanism for copying the - * namespace's boot verifier without tearing it. + * namespace's write verifier without tearing it. */ -void nfsd_copy_boot_verifier(__be32 verf[2], struct nfsd_net *nn) +void nfsd_copy_write_verifier(__be32 verf[2], struct nfsd_net *nn) { int seq = 0; @@ -364,7 +364,7 @@ void nfsd_copy_boot_verifier(__be32 verf[2], struct nfsd_net *nn) done_seqretry(&nn->writeverf_lock, seq); } -static void nfsd_reset_boot_verifier_locked(struct nfsd_net *nn) +static void nfsd_reset_write_verifier_locked(struct nfsd_net *nn) { struct timespec64 now; u64 verf; @@ -379,7 +379,7 @@ static void nfsd_reset_boot_verifier_locked(struct nfsd_net *nn) } /** - * nfsd_reset_boot_verifier - Generate a new boot verifier + * nfsd_reset_write_verifier - Generate a new write verifier * @nn: NFS net namespace * * This function updates the ->writeverf field of @nn. This field @@ -391,10 +391,10 @@ static void nfsd_reset_boot_verifier_locked(struct nfsd_net *nn) * server and MUST be unique between instances of the NFSv4.1 * server." */ -void nfsd_reset_boot_verifier(struct nfsd_net *nn) +void nfsd_reset_write_verifier(struct nfsd_net *nn) { write_seqlock(&nn->writeverf_lock); - nfsd_reset_boot_verifier_locked(nn); + nfsd_reset_write_verifier_locked(nn); write_sequnlock(&nn->writeverf_lock); } @@ -683,7 +683,7 @@ int nfsd_create_serv(struct net *net) register_inet6addr_notifier(&nfsd_inet6addr_notifier); #endif } - nfsd_reset_boot_verifier(nn); + nfsd_reset_write_verifier(nn); return 0; } diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 70ea7e0aae073..49564457bd3d8 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -559,8 +559,8 @@ __be32 nfsd4_clone_file_range(struct svc_rqst *rqstp, &nfsd4_get_cstate(rqstp)->current_fh, dst_pos, count, status); - nfsd_reset_boot_verifier(net_generic(nf_dst->nf_net, - nfsd_net_id)); + nfsd_reset_write_verifier(net_generic(nf_dst->nf_net, + nfsd_net_id)); ret = nfserrno(status); } } @@ -1013,10 +1013,10 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, iov_iter_kvec(&iter, WRITE, vec, vlen, *cnt); since = READ_ONCE(file->f_wb_err); if (verf) - nfsd_copy_boot_verifier(verf, nn); + nfsd_copy_write_verifier(verf, nn); host_err = vfs_iter_write(file, &iter, &pos, flags); if (host_err < 0) { - nfsd_reset_boot_verifier(nn); + nfsd_reset_write_verifier(nn); goto out_nfserr; } *cnt = host_err; @@ -1029,7 +1029,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, if (stable && use_wgather) { host_err = wait_for_concurrent_writes(file); if (host_err < 0) - nfsd_reset_boot_verifier(nn); + nfsd_reset_write_verifier(nn); } out_nfserr: @@ -1142,7 +1142,7 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, err2 = vfs_fsync_range(nf->nf_file, offset, end, 0); switch (err2) { case 0: - nfsd_copy_boot_verifier(verf, nn); + nfsd_copy_write_verifier(verf, nn); err2 = filemap_check_wb_err(nf->nf_file->f_mapping, since); break; @@ -1150,11 +1150,11 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, err = nfserr_notsupp; break; default: - nfsd_reset_boot_verifier(nn); + nfsd_reset_write_verifier(nn); } err = nfserrno(err2); } else - nfsd_copy_boot_verifier(verf, nn); + nfsd_copy_write_verifier(verf, nn); nfsd_file_put(nf); out: From 75acacb6583df0b9328dc701d8eeea05af49b8b5 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 28 Dec 2021 14:27:56 -0500 Subject: [PATCH 44/51] NFSD: Trace boot verifier resets According to commit bbf2f098838a ("nfsd: Reset the boot verifier on all write I/O errors"), the Linux NFS server forces all clients to resend pending unstable writes if any server-side write or commit operation encounters an error (say, ENOSPC). This is a rare and quite exceptional event that could require administrative recovery action, so it should be made trace-able. Example trace event: nfsd-938 [002] 7174.945558: nfsd_writeverf_reset: boot_time= 61cc920d xid=0xdcd62036 error=-28 new verifier=0x08aecc6142515904 Signed-off-by: Chuck Lever --- fs/nfsd/trace.h | 28 ++++++++++++++++++++++++++++ fs/nfsd/vfs.c | 13 ++++++++++--- 2 files changed, 38 insertions(+), 3 deletions(-) diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index a0b2b8d87de6f..c4cf563278430 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -574,6 +574,34 @@ DEFINE_EVENT(nfsd_net_class, nfsd_##name, \ DEFINE_NET_EVENT(grace_start); DEFINE_NET_EVENT(grace_complete); +TRACE_EVENT(nfsd_writeverf_reset, + TP_PROTO( + const struct nfsd_net *nn, + const struct svc_rqst *rqstp, + int error + ), + TP_ARGS(nn, rqstp, error), + TP_STRUCT__entry( + __field(unsigned long long, boot_time) + __field(u32, xid) + __field(int, error) + __array(unsigned char, verifier, NFS4_VERIFIER_SIZE) + ), + TP_fast_assign( + __entry->boot_time = nn->boot_time; + __entry->xid = be32_to_cpu(rqstp->rq_xid); + __entry->error = error; + + /* avoid seqlock inside TP_fast_assign */ + memcpy(__entry->verifier, nn->writeverf, + NFS4_VERIFIER_SIZE); + ), + TP_printk("boot_time=%16llx xid=0x%08x error=%d new verifier=0x%s", + __entry->boot_time, __entry->xid, __entry->error, + __print_hex_str(__entry->verifier, NFS4_VERIFIER_SIZE) + ) +); + TRACE_EVENT(nfsd_clid_cred_mismatch, TP_PROTO( const struct nfs4_client *clp, diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 49564457bd3d8..e4e59e1660e18 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -553,14 +553,17 @@ __be32 nfsd4_clone_file_range(struct svc_rqst *rqstp, if (!status) status = commit_inode_metadata(file_inode(src)); if (status < 0) { + struct nfsd_net *nn = net_generic(nf_dst->nf_net, + nfsd_net_id); + trace_nfsd_clone_file_range_err(rqstp, &nfsd4_get_cstate(rqstp)->save_fh, src_pos, &nfsd4_get_cstate(rqstp)->current_fh, dst_pos, count, status); - nfsd_reset_write_verifier(net_generic(nf_dst->nf_net, - nfsd_net_id)); + nfsd_reset_write_verifier(nn); + trace_nfsd_writeverf_reset(nn, rqstp, status); ret = nfserrno(status); } } @@ -1017,6 +1020,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, host_err = vfs_iter_write(file, &iter, &pos, flags); if (host_err < 0) { nfsd_reset_write_verifier(nn); + trace_nfsd_writeverf_reset(nn, rqstp, host_err); goto out_nfserr; } *cnt = host_err; @@ -1028,8 +1032,10 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, if (stable && use_wgather) { host_err = wait_for_concurrent_writes(file); - if (host_err < 0) + if (host_err < 0) { nfsd_reset_write_verifier(nn); + trace_nfsd_writeverf_reset(nn, rqstp, host_err); + } } out_nfserr: @@ -1151,6 +1157,7 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, break; default: nfsd_reset_write_verifier(nn); + trace_nfsd_writeverf_reset(nn, rqstp, err2); } err = nfserrno(err2); } else From 58f258f65267542959487dbe8b5641754411843d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 24 Dec 2021 14:22:28 -0500 Subject: [PATCH 45/51] Revert "nfsd: skip some unnecessary stats in the v4 case" On the wire, I observed NFSv4 OPEN(CREATE) operations sometimes returning a reasonable-looking value in the cinfo.before field and zero in the cinfo.after field. RFC 8881 Section 10.8.1 says: > When a client is making changes to a given directory, it needs to > determine whether there have been changes made to the directory by > other clients. It does this by using the change attribute as > reported before and after the directory operation in the associated > change_info4 value returned for the operation. and > ... The post-operation change > value needs to be saved as the basis for future change_info4 > comparisons. A good quality client implementation therefore saves the zero cinfo.after value. During a subsequent OPEN operation, it will receive a different non-zero value in the cinfo.before field for that directory, and it will incorrectly believe the directory has changed, triggering an undesirable directory cache invalidation. There are filesystem types where fs_supports_change_attribute() returns false, tmpfs being one. On NFSv4 mounts, this means the fh_getattr() call site in fill_pre_wcc() and fill_post_wcc() is never invoked. Subsequently, nfsd4_change_attribute() is invoked with an uninitialized @stat argument. In fill_pre_wcc(), @stat contains stale stack garbage, which is then placed on the wire. In fill_post_wcc(), ->fh_post_wc is all zeroes, so zero is placed on the wire. Both of these values are meaningless. This fix can be applied immediately to stable kernels. Once there are more regression tests in this area, this optimization can be attempted again. Fixes: 428a23d2bf0c ("nfsd: skip some unnecessary stats in the v4 case") Signed-off-by: Chuck Lever --- fs/nfsd/nfs3xdr.c | 44 +++++++++++++++++--------------------------- 1 file changed, 17 insertions(+), 27 deletions(-) diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index c3ac1b6aa3aaa..84088581bbe09 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -487,11 +487,6 @@ svcxdr_encode_wcc_data(struct svc_rqst *rqstp, struct xdr_stream *xdr, return true; } -static bool fs_supports_change_attribute(struct super_block *sb) -{ - return sb->s_flags & SB_I_VERSION || sb->s_export_op->fetch_iversion; -} - /* * Fill in the pre_op attr for the wcc data */ @@ -500,26 +495,24 @@ void fill_pre_wcc(struct svc_fh *fhp) struct inode *inode; struct kstat stat; bool v4 = (fhp->fh_maxsize == NFS4_FHSIZE); + __be32 err; if (fhp->fh_no_wcc || fhp->fh_pre_saved) return; inode = d_inode(fhp->fh_dentry); - if (fs_supports_change_attribute(inode->i_sb) || !v4) { - __be32 err = fh_getattr(fhp, &stat); - - if (err) { - /* Grab the times from inode anyway */ - stat.mtime = inode->i_mtime; - stat.ctime = inode->i_ctime; - stat.size = inode->i_size; - } - fhp->fh_pre_mtime = stat.mtime; - fhp->fh_pre_ctime = stat.ctime; - fhp->fh_pre_size = stat.size; + err = fh_getattr(fhp, &stat); + if (err) { + /* Grab the times from inode anyway */ + stat.mtime = inode->i_mtime; + stat.ctime = inode->i_ctime; + stat.size = inode->i_size; } if (v4) fhp->fh_pre_change = nfsd4_change_attribute(&stat, inode); + fhp->fh_pre_mtime = stat.mtime; + fhp->fh_pre_ctime = stat.ctime; + fhp->fh_pre_size = stat.size; fhp->fh_pre_saved = true; } @@ -530,6 +523,7 @@ void fill_post_wcc(struct svc_fh *fhp) { bool v4 = (fhp->fh_maxsize == NFS4_FHSIZE); struct inode *inode = d_inode(fhp->fh_dentry); + __be32 err; if (fhp->fh_no_wcc) return; @@ -537,16 +531,12 @@ void fill_post_wcc(struct svc_fh *fhp) if (fhp->fh_post_saved) printk("nfsd: inode locked twice during operation.\n"); - fhp->fh_post_saved = true; - - if (fs_supports_change_attribute(inode->i_sb) || !v4) { - __be32 err = fh_getattr(fhp, &fhp->fh_post_attr); - - if (err) { - fhp->fh_post_saved = false; - fhp->fh_post_attr.ctime = inode->i_ctime; - } - } + err = fh_getattr(fhp, &fhp->fh_post_attr); + if (err) { + fhp->fh_post_saved = false; + fhp->fh_post_attr.ctime = inode->i_ctime; + } else + fhp->fh_post_saved = true; if (v4) fhp->fh_post_change = nfsd4_change_attribute(&fhp->fh_post_attr, inode); From fcb5e3fa012351f3b96024c07bc44834c2478213 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 24 Dec 2021 14:36:49 -0500 Subject: [PATCH 46/51] NFSD: Move fill_pre_wcc() and fill_post_wcc() These functions are related to file handle processing and have nothing to do with XDR encoding or decoding. Also they are no longer NFSv3-specific. As a clean-up, move their definitions to a more appropriate location. WCC is also an NFSv3-specific term, so rename them as general-purpose helpers. Signed-off-by: Chuck Lever --- fs/nfsd/nfs3xdr.c | 55 -------------------------------------- fs/nfsd/nfs4proc.c | 2 +- fs/nfsd/nfsfh.c | 66 +++++++++++++++++++++++++++++++++++++++++++++- fs/nfsd/nfsfh.h | 40 ++++++++++++++++++---------- fs/nfsd/vfs.c | 8 +++--- 5 files changed, 96 insertions(+), 75 deletions(-) diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 84088581bbe09..7c45ba4db61be 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -487,61 +487,6 @@ svcxdr_encode_wcc_data(struct svc_rqst *rqstp, struct xdr_stream *xdr, return true; } -/* - * Fill in the pre_op attr for the wcc data - */ -void fill_pre_wcc(struct svc_fh *fhp) -{ - struct inode *inode; - struct kstat stat; - bool v4 = (fhp->fh_maxsize == NFS4_FHSIZE); - __be32 err; - - if (fhp->fh_no_wcc || fhp->fh_pre_saved) - return; - inode = d_inode(fhp->fh_dentry); - err = fh_getattr(fhp, &stat); - if (err) { - /* Grab the times from inode anyway */ - stat.mtime = inode->i_mtime; - stat.ctime = inode->i_ctime; - stat.size = inode->i_size; - } - if (v4) - fhp->fh_pre_change = nfsd4_change_attribute(&stat, inode); - - fhp->fh_pre_mtime = stat.mtime; - fhp->fh_pre_ctime = stat.ctime; - fhp->fh_pre_size = stat.size; - fhp->fh_pre_saved = true; -} - -/* - * Fill in the post_op attr for the wcc data - */ -void fill_post_wcc(struct svc_fh *fhp) -{ - bool v4 = (fhp->fh_maxsize == NFS4_FHSIZE); - struct inode *inode = d_inode(fhp->fh_dentry); - __be32 err; - - if (fhp->fh_no_wcc) - return; - - if (fhp->fh_post_saved) - printk("nfsd: inode locked twice during operation.\n"); - - err = fh_getattr(fhp, &fhp->fh_post_attr); - if (err) { - fhp->fh_post_saved = false; - fhp->fh_post_attr.ctime = inode->i_ctime; - } else - fhp->fh_post_saved = true; - if (v4) - fhp->fh_post_change = - nfsd4_change_attribute(&fhp->fh_post_attr, inode); -} - /* * XDR decode functions */ diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 6f53eb90c6b44..ed1ee25647bef 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -2530,7 +2530,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp) goto encode_op; } - fh_clear_wcc(current_fh); + fh_clear_pre_post_attrs(current_fh); /* If op is non-idempotent */ if (op->opdesc->op_flags & OP_MODIFIES_SOMETHING) { diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index f3779fa72c896..145208bcb9bd4 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -611,6 +611,70 @@ fh_update(struct svc_fh *fhp) return nfserr_serverfault; } +#ifdef CONFIG_NFSD_V3 + +/** + * fh_fill_pre_attrs - Fill in pre-op attributes + * @fhp: file handle to be updated + * + */ +void fh_fill_pre_attrs(struct svc_fh *fhp) +{ + bool v4 = (fhp->fh_maxsize == NFS4_FHSIZE); + struct inode *inode; + struct kstat stat; + __be32 err; + + if (fhp->fh_no_wcc || fhp->fh_pre_saved) + return; + + inode = d_inode(fhp->fh_dentry); + err = fh_getattr(fhp, &stat); + if (err) { + /* Grab the times from inode anyway */ + stat.mtime = inode->i_mtime; + stat.ctime = inode->i_ctime; + stat.size = inode->i_size; + } + if (v4) + fhp->fh_pre_change = nfsd4_change_attribute(&stat, inode); + + fhp->fh_pre_mtime = stat.mtime; + fhp->fh_pre_ctime = stat.ctime; + fhp->fh_pre_size = stat.size; + fhp->fh_pre_saved = true; +} + +/** + * fh_fill_post_attrs - Fill in post-op attributes + * @fhp: file handle to be updated + * + */ +void fh_fill_post_attrs(struct svc_fh *fhp) +{ + bool v4 = (fhp->fh_maxsize == NFS4_FHSIZE); + struct inode *inode = d_inode(fhp->fh_dentry); + __be32 err; + + if (fhp->fh_no_wcc) + return; + + if (fhp->fh_post_saved) + printk("nfsd: inode locked twice during operation.\n"); + + err = fh_getattr(fhp, &fhp->fh_post_attr); + if (err) { + fhp->fh_post_saved = false; + fhp->fh_post_attr.ctime = inode->i_ctime; + } else + fhp->fh_post_saved = true; + if (v4) + fhp->fh_post_change = + nfsd4_change_attribute(&fhp->fh_post_attr, inode); +} + +#endif /* CONFIG_NFSD_V3 */ + /* * Release a file handle. */ @@ -623,7 +687,7 @@ fh_put(struct svc_fh *fhp) fh_unlock(fhp); fhp->fh_dentry = NULL; dput(dentry); - fh_clear_wcc(fhp); + fh_clear_pre_post_attrs(fhp); } fh_drop_write(fhp); if (exp) { diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index d11e4b6870d68..434930d8a946e 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -284,12 +284,13 @@ static inline u32 knfsd_fh_hash(const struct knfsd_fh *fh) #endif #ifdef CONFIG_NFSD_V3 -/* - * The wcc data stored in current_fh should be cleared - * between compound ops. + +/** + * fh_clear_pre_post_attrs - Reset pre/post attributes + * @fhp: file handle to be updated + * */ -static inline void -fh_clear_wcc(struct svc_fh *fhp) +static inline void fh_clear_pre_post_attrs(struct svc_fh *fhp) { fhp->fh_post_saved = false; fhp->fh_pre_saved = false; @@ -323,13 +324,24 @@ static inline u64 nfsd4_change_attribute(struct kstat *stat, return time_to_chattr(&stat->ctime); } -extern void fill_pre_wcc(struct svc_fh *fhp); -extern void fill_post_wcc(struct svc_fh *fhp); -#else -#define fh_clear_wcc(ignored) -#define fill_pre_wcc(ignored) -#define fill_post_wcc(notused) -#endif /* CONFIG_NFSD_V3 */ +extern void fh_fill_pre_attrs(struct svc_fh *fhp); +extern void fh_fill_post_attrs(struct svc_fh *fhp); + +#else /* !CONFIG_NFSD_V3 */ + +static inline void fh_clear_pre_post_attrs(struct svc_fh *fhp) +{ +} + +static inline void fh_fill_pre_attrs(struct svc_fh *fhp) +{ +} + +static inline void fh_fill_post_attrs(struct svc_fh *fhp) +{ +} + +#endif /* !CONFIG_NFSD_V3 */ /* @@ -355,7 +367,7 @@ fh_lock_nested(struct svc_fh *fhp, unsigned int subclass) inode = d_inode(dentry); inode_lock_nested(inode, subclass); - fill_pre_wcc(fhp); + fh_fill_pre_attrs(fhp); fhp->fh_locked = true; } @@ -372,7 +384,7 @@ static inline void fh_unlock(struct svc_fh *fhp) { if (fhp->fh_locked) { - fill_post_wcc(fhp); + fh_fill_post_attrs(fhp); inode_unlock(d_inode(fhp->fh_dentry)); fhp->fh_locked = false; } diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index e4e59e1660e18..99c2b9dfbb104 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1755,8 +1755,8 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, * so do it by hand */ trap = lock_rename(tdentry, fdentry); ffhp->fh_locked = tfhp->fh_locked = true; - fill_pre_wcc(ffhp); - fill_pre_wcc(tfhp); + fh_fill_pre_attrs(ffhp); + fh_fill_pre_attrs(tfhp); odentry = lookup_one_len(fname, fdentry, flen); host_err = PTR_ERR(odentry); @@ -1816,8 +1816,8 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, * were the same, so again we do it by hand. */ if (!close_cached) { - fill_post_wcc(ffhp); - fill_post_wcc(tfhp); + fh_fill_post_attrs(ffhp); + fh_fill_post_attrs(tfhp); } unlock_rename(tdentry, fdentry); ffhp->fh_locked = tfhp->fh_locked = false; From 7f4f5d70adfd88a08d6e122cfe2cf637ff84dd11 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 5 Jan 2022 20:12:13 -0500 Subject: [PATCH 47/51] MAINTAINERS: remove bfields I'm cutting back on my responsibilities. The NFS server and file locking code are in good hands. Signed-off-by: J. Bruce Fields Signed-off-by: Chuck Lever --- MAINTAINERS | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 13f9a84a617e3..0c753f25b6d24 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7334,7 +7334,6 @@ F: include/uapi/scsi/fc/ FILE LOCKING (flock() and fcntl()/lockf()) M: Jeff Layton -M: "J. Bruce Fields" L: linux-fsdevel@vger.kernel.org S: Maintained F: fs/fcntl.c @@ -10330,12 +10329,11 @@ S: Odd Fixes W: http://kernelnewbies.org/KernelJanitors KERNEL NFSD, SUNRPC, AND LOCKD SERVERS -M: "J. Bruce Fields" M: Chuck Lever L: linux-nfs@vger.kernel.org S: Supported W: http://nfs.sourceforge.net/ -T: git git://linux-nfs.org/~bfields/linux.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/cel/linux.git F: fs/lockd/ F: fs/nfs_common/ F: fs/nfsd/ From 074b07d94e0bb6ddce5690a9b7e2373088e8b33a Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 5 Jan 2022 14:15:03 -0500 Subject: [PATCH 48/51] nfsd: fix crash on COPY_NOTIFY with special stateid RTM says "If the special ONE stateid is passed to nfs4_preprocess_stateid_op(), it returns status=0 but does not set *cstid. nfsd4_copy_notify() depends on stid being set if status=0, and thus can crash if the client sends the right COPY_NOTIFY RPC." RFC 7862 says "The cna_src_stateid MUST refer to either open or locking states provided earlier by the server. If it is invalid, then the operation MUST fail." The RFC doesn't specify an error, and the choice doesn't matter much as this is clearly illegal client behavior, but bad_stateid seems reasonable. Simplest is just to guarantee that nfs4_preprocess_stateid_op, called with non-NULL cstid, errors out if it can't return a stateid. Reported-by: rtm@csail.mit.edu Fixes: 624322f1adc5 ("NFSD add COPY_NOTIFY operation") Signed-off-by: J. Bruce Fields Signed-off-by: Chuck Lever Reviewed-by: Olga Kornievskaia Tested-by: Olga Kornievskaia --- fs/nfsd/nfs4state.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index eb920f855969f..72900b89cf84c 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -6058,7 +6058,11 @@ nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, *nfp = NULL; if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) { - status = check_special_stateids(net, fhp, stateid, flags); + if (cstid) + status = nfserr_bad_stateid; + else + status = check_special_stateids(net, fhp, stateid, + flags); goto done; } From 0ea9fc15b1d7d6636d429e74ffe3f86bf2f2f7d6 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 23 Nov 2021 17:05:07 +0100 Subject: [PATCH 49/51] fs/locks: fix fcntl_getlk64/fcntl_setlk64 stub prototypes My patch to rework oabi fcntl64() introduced a harmless sparse warning when file locking is disabled: arch/arm/kernel/sys_oabi-compat.c:251:51: sparse: sparse: incorrect type in argument 3 (different address spaces) @@ expected struct flock64 [noderef] __user *user @@ got struct flock64 * @@ arch/arm/kernel/sys_oabi-compat.c:251:51: sparse: expected struct flock64 [noderef] __user *user arch/arm/kernel/sys_oabi-compat.c:251:51: sparse: got struct flock64 * arch/arm/kernel/sys_oabi-compat.c:265:55: sparse: sparse: incorrect type in argument 4 (different address spaces) @@ expected struct flock64 [noderef] __user *user @@ got struct flock64 * @@ arch/arm/kernel/sys_oabi-compat.c:265:55: sparse: expected struct flock64 [noderef] __user *user arch/arm/kernel/sys_oabi-compat.c:265:55: sparse: got struct flock64 * When file locking is enabled, everything works correctly and the right data gets passed, but the stub declarations in linux/fs.h did not get modified when the calling conventions changed in an earlier patch. Reported-by: kernel test robot Fixes: 7e2d8c29ecdd ("ARM: 9111/1: oabi-compat: rework fcntl64() emulation") Fixes: a75d30c77207 ("fs/locks: pass kernel struct flock to fcntl_getlk/setlk") Cc: Christoph Hellwig Reviewed-by: Christoph Hellwig Acked-by: Christian Brauner Signed-off-by: Arnd Bergmann Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/fs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index bbf812ce89a8c..5122d13775c2d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1220,13 +1220,13 @@ static inline int fcntl_setlk(unsigned int fd, struct file *file, #if BITS_PER_LONG == 32 static inline int fcntl_getlk64(struct file *file, unsigned int cmd, - struct flock64 __user *user) + struct flock64 *user) { return -EINVAL; } static inline int fcntl_setlk64(unsigned int fd, struct file *file, - unsigned int cmd, struct flock64 __user *user) + unsigned int cmd, struct flock64 *user) { return -EACCES; } From dc6c6fb3d639756a532bcc47d4a9bf9f3965881b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 9 Jan 2022 13:26:51 -0500 Subject: [PATCH 50/51] SUNRPC: Fix sockaddr handling in the svc_xprt_create_error trace point While testing, I got an unexpected KASAN splat: Jan 08 13:50:27 oracle-102.nfsv4.dev kernel: BUG: KASAN: stack-out-of-bounds in trace_event_raw_event_svc_xprt_create_err+0x190/0x210 [sunrpc] Jan 08 13:50:27 oracle-102.nfsv4.dev kernel: Read of size 28 at addr ffffc9000008f728 by task mount.nfs/4628 The memcpy() in the TP_fast_assign section of this trace point copies the size of the destination buffer in order that the buffer won't be overrun. In other similar trace points, the source buffer for this memcpy is a "struct sockaddr_storage" so the actual length of the source buffer is always long enough to prevent the memcpy from reading uninitialized or unallocated memory. However, for this trace point, the source buffer can be as small as a "struct sockaddr_in". For AF_INET sockaddrs, the memcpy() reads memory that follows the source buffer, which is not always valid memory. To avoid copying past the end of the passed-in sockaddr, make the source address's length available to the memcpy(). It would be a little nicer if the tracing infrastructure was more friendly about storing socket addresses that are not AF_INET, but I could not find a way to make printk("%pIS") work with a dynamic array. Reported-by: KASAN Fixes: 4b8f380e46e4 ("SUNRPC: Tracepoint to record errors in svc_xpo_create()") Signed-off-by: Chuck Lever --- include/trace/events/sunrpc.h | 5 +++-- net/sunrpc/svc_xprt.c | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 684cc0e322fa3..1c0a288f6a5c6 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -1744,10 +1744,11 @@ TRACE_EVENT(svc_xprt_create_err, const char *program, const char *protocol, struct sockaddr *sap, + size_t salen, const struct svc_xprt *xprt ), - TP_ARGS(program, protocol, sap, xprt), + TP_ARGS(program, protocol, sap, salen, xprt), TP_STRUCT__entry( __field(long, error) @@ -1760,7 +1761,7 @@ TRACE_EVENT(svc_xprt_create_err, __entry->error = PTR_ERR(xprt); __assign_str(program, program); __assign_str(protocol, protocol); - memcpy(__entry->addr, sap, sizeof(__entry->addr)); + memcpy(__entry->addr, sap, min(salen, sizeof(__entry->addr))); ), TP_printk("addr=%pISpc program=%s protocol=%s error=%ld", diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index b1744432489e3..1d8fc9d8da090 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -243,7 +243,7 @@ static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl, xprt = xcl->xcl_ops->xpo_create(serv, net, sap, len, flags); if (IS_ERR(xprt)) trace_svc_xprt_create_err(serv->sv_program->pg_name, - xcl->xcl_name, sap, xprt); + xcl->xcl_name, sap, len, xprt); return xprt; } From 16720861675393a35974532b3c837d9fd7bfe08c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 8 Jan 2022 16:59:54 -0500 Subject: [PATCH 51/51] SUNRPC: Fix sockaddr handling in svcsock_accept_class trace points Avoid potentially hazardous memory copying and the needless use of "%pIS" -- in the kernel, an RPC service listener is always bound to ANYADDR. Having the network namespace is helpful when recording errors, though. Fixes: a0469f46faab ("SUNRPC: Replace dprintk call sites in TCP state change callouts") Signed-off-by: Chuck Lever --- include/trace/events/sunrpc.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 1c0a288f6a5c6..1e566ac4b8123 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -2125,17 +2125,17 @@ DECLARE_EVENT_CLASS(svcsock_accept_class, TP_STRUCT__entry( __field(long, status) __string(service, service) - __array(unsigned char, addr, sizeof(struct sockaddr_in6)) + __field(unsigned int, netns_ino) ), TP_fast_assign( __entry->status = status; __assign_str(service, service); - memcpy(__entry->addr, &xprt->xpt_local, sizeof(__entry->addr)); + __entry->netns_ino = xprt->xpt_net->ns.inum; ), - TP_printk("listener=%pISpc service=%s status=%ld", - __entry->addr, __get_str(service), __entry->status + TP_printk("addr=listener service=%s status=%ld", + __get_str(service), __entry->status ) );