From dea450c90f463de57d7f351711a6ac7e89090843 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Tue, 2 Nov 2021 15:17:07 -0400 Subject: [PATCH 01/32] fs: dlm: remove obsolete INBUF define This patch removes an obsolete define for the length of a temporary buffer which is not being used anymore. The use of this define is not necessary anymore since commit 4798cbbfbd00 ("fs: dlm: rework receive handling"). Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/dlm_internal.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 5f57538b5d450..44a5c67b52134 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -41,12 +41,6 @@ #include #include "config.h" -/* Size of the temp buffer midcomms allocates on the stack. - We try to make this large enough so most messages fit. - FIXME: should sctp make this unnecessary? */ - -#define DLM_INBUF_LEN 148 - struct dlm_ls; struct dlm_lkb; struct dlm_rsb; From bb6866a5bdc5ff0236147c01394f6a264978a16c Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Tue, 2 Nov 2021 15:17:08 -0400 Subject: [PATCH 02/32] fs: dlm: fix small lockspace typo This patch fixes a typo from locksapce to lockspace. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lockspace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 10eddfa6c3d7b..b90566502a813 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -868,7 +868,7 @@ static int release_lockspace(struct dlm_ls *ls, int force) * until this returns. 
* * Force has 4 possible values: - * 0 - don't destroy locksapce if it has any LKBs + * 0 - don't destroy lockspace if it has any LKBs * 1 - destroy lockspace if it has remote LKBs but not if it has local LKBs * 2 - destroy lockspace regardless of LKBs * 3 - destroy lockspace as part of a forced shutdown From 1aafd9c231919dea9b10e654107e24d5c553c60d Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Tue, 2 Nov 2021 15:17:09 -0400 Subject: [PATCH 03/32] fs: dlm: debug improvements print nodeid This patch improves the debug output for midcomms layer by also printing out the nodeid where users counter belongs to. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/midcomms.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index 7ae39ec8d9b0a..008078f06813d 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -1231,7 +1231,7 @@ void dlm_midcomms_add_member(int nodeid) } node->users++; - pr_debug("users inc count %d\n", node->users); + pr_debug("node %d users inc count %d\n", nodeid, node->users); spin_unlock(&node->state_lock); srcu_read_unlock(&nodes_srcu, idx); @@ -1254,7 +1254,7 @@ void dlm_midcomms_remove_member(int nodeid) spin_lock(&node->state_lock); node->users--; - pr_debug("users dec count %d\n", node->users); + pr_debug("node %d users dec count %d\n", nodeid, node->users); /* hitting users count to zero means the * other side is running dlm_midcomms_stop() From fe93367541bcedaba1dd5cb9cf138eec0267ea56 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Tue, 2 Nov 2021 15:17:10 -0400 Subject: [PATCH 04/32] fs: dlm: remove check SCTP is loaded message Since commit 764ff4011424 ("fs: dlm: auto load sctp module") we try load the sctp module before we try to create a sctp kernel socket. That a socket creation fails now has more likely other reasons. This patch removes the part of error to load the sctp module and instead printout the error code. 
Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 8f715c620e1f8..bee3757eb4c73 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1775,7 +1775,7 @@ static int dlm_listen_for_all(void) result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family, SOCK_STREAM, dlm_proto_ops->proto, &sock); if (result < 0) { - log_print("Can't create comms socket, check SCTP is loaded"); + log_print("Can't create comms socket: %d", result); goto out; } From 658bd576f95ed597e519cdadf1c86ac87c17aea5 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Tue, 2 Nov 2021 15:17:11 -0400 Subject: [PATCH 05/32] fs: dlm: move version conversion to compile time This patch moves version conversion to little endian from a runtime variable to compile time constant. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/midcomms.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index 008078f06813d..76bdc3a9dc61d 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -909,11 +909,11 @@ int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len) if (msglen > len) break; - switch (le32_to_cpu(hd->h_version)) { - case DLM_VERSION_3_1: + switch (hd->h_version) { + case cpu_to_le32(DLM_VERSION_3_1): dlm_midcomms_receive_buffer_3_1((union dlm_packet *)ptr, nodeid); break; - case DLM_VERSION_3_2: + case cpu_to_le32(DLM_VERSION_3_2): dlm_midcomms_receive_buffer_3_2((union dlm_packet *)ptr, nodeid); break; default: From 3e9736713d0cb2877b11ec7185b231bba7b21936 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Tue, 2 Nov 2021 15:17:12 -0400 Subject: [PATCH 06/32] fs: dlm: use dlm_recovery_stopped instead of test_bit This patch will change to use dlm_recovery_stopped() which is the dlm way to check if the LSFL_RECOVER_STOP flag in ls_flags by using the helper. 
It is an atomic operation, but as before the value is still fetched while ls_recover_lock is held. Further investigation may be needed into whether the value can be changed afterwards and whether that has any side effects. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/rcom.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c index 6cba86470278a..5821b777a1a74 100644 --- a/fs/dlm/rcom.c +++ b/fs/dlm/rcom.c @@ -601,7 +601,7 @@ void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) spin_lock(&ls->ls_recover_lock); status = ls->ls_recover_status; - stop = test_bit(LSFL_RECOVER_STOP, &ls->ls_flags); + stop = dlm_recovery_stopped(ls); seq = ls->ls_recover_seq; spin_unlock(&ls->ls_recover_lock); From e10249b1902d3b0b71e99f518a695c2c39ab4fe6 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Tue, 2 Nov 2021 15:17:13 -0400 Subject: [PATCH 07/32] fs: dlm: use dlm_recovery_stopped in condition This patch will change to evaluate the dlm_recovery_stopped() in the condition of the if branch instead of fetching it before evaluating the condition. As this is an atomic test-set operation it should be evaluated in the condition itself. 
Reported-by: Andreas Gruenbacher Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/dir.c | 3 +-- fs/dlm/member.c | 3 +-- fs/dlm/recoverd.c | 3 +-- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c index 45ebbe602bbf0..b6692f81ec83e 100644 --- a/fs/dlm/dir.c +++ b/fs/dlm/dir.c @@ -84,8 +84,7 @@ int dlm_recover_directory(struct dlm_ls *ls) for (;;) { int left; - error = dlm_recovery_stopped(ls); - if (error) { + if (dlm_recovery_stopped(ls)) { error = -EINTR; goto out_free; } diff --git a/fs/dlm/member.c b/fs/dlm/member.c index 731d489aa323e..61f906e705db8 100644 --- a/fs/dlm/member.c +++ b/fs/dlm/member.c @@ -442,8 +442,7 @@ static int ping_members(struct dlm_ls *ls) int error = 0; list_for_each_entry(memb, &ls->ls_nodes, list) { - error = dlm_recovery_stopped(ls); - if (error) { + if (dlm_recovery_stopped(ls)) { error = -EINTR; break; } diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c index 97d052cea5a92..a55dfce705dd2 100644 --- a/fs/dlm/recoverd.c +++ b/fs/dlm/recoverd.c @@ -124,8 +124,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) dlm_recover_waiters_pre(ls); - error = dlm_recovery_stopped(ls); - if (error) { + if (dlm_recovery_stopped(ls)) { error = -EINTR; goto fail; } From 2f05ec4327ffaa34877de67fc5bb5eb3ab3767f0 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Tue, 2 Nov 2021 15:17:14 -0400 Subject: [PATCH 08/32] fs: dlm: make dlm_callback_resume quiet This patch makes the dlm_callback_resume info printout less noisy by accumulating all callback queues into one printout instead of printing in steps of 25. It seems this printout has lately become quite noisy in connection with gfs2. Before: [241767.849302] dlm: bin: dlm_callback_resume 25 [241767.854846] dlm: bin: dlm_callback_resume 25 [241767.860373] dlm: bin: dlm_callback_resume 25 ... 
[241767.865920] dlm: bin: dlm_callback_resume 25 [241767.871352] dlm: bin: dlm_callback_resume 25 [241767.876733] dlm: bin: dlm_callback_resume 25 After the patch: [ 385.485728] dlm: gfs2: dlm_callback_resume 175 if zero it will not be printed out. Reported-by: Barry Marson Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/ast.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c index 283c7b94eddad..6600930497ccc 100644 --- a/fs/dlm/ast.c +++ b/fs/dlm/ast.c @@ -295,7 +295,7 @@ void dlm_callback_suspend(struct dlm_ls *ls) void dlm_callback_resume(struct dlm_ls *ls) { struct dlm_lkb *lkb, *safe; - int count = 0; + int count = 0, sum = 0; clear_bit(LSFL_CB_DELAY, &ls->ls_flags); @@ -313,12 +313,14 @@ void dlm_callback_resume(struct dlm_ls *ls) } mutex_unlock(&ls->ls_cb_mutex); - if (count) - log_rinfo(ls, "dlm_callback_resume %d", count); + sum += count; if (count == MAX_CB_QUEUE) { count = 0; cond_resched(); goto more; } + + if (sum) + log_rinfo(ls, "%s %d", __func__, sum); } From f1d3b8f91d965c4fd900ac5dd06240cc9df0c7a7 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Tue, 2 Nov 2021 15:17:15 -0400 Subject: [PATCH 09/32] fs: dlm: initial support for tracepoints This patch adds initial support for dlm tracepoints. It will introduce tracepoints to dlm main functionality dlm_lock()/dlm_unlock() and their complete ast() callback or blocking bast() callback. The lock/unlock functionality has a start and end tracepoint, this is because there exists a race in case if would have a tracepoint at the end position only the complete/blocking callbacks could occur before. To work with eBPF tracing and using their lookup hash functionality there could be problems that an entry was not inserted yet. However use the start functionality for hash insert and check again in end functionality if there was an dlm internal error so there is no ast callback. 
In further it might also that locks with local masters will occur those callbacks immediately so we must have such functionality. I did not make everything accessible yet, although it seems eBPF can be used to access a lot of internal datastructures if it's aware of the struct definitions of the running kernel instance. We still can change it, if you do eBPF experiments e.g. time measurements between lock and callback functionality you can simple use the local lkb_id field as hash value in combination with the lockspace id if you have multiple lockspaces. Otherwise you can simple use trace-cmd for some functionality, e.g. `trace-cmd record -e dlm` and `trace-cmd report` afterwards. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/ast.c | 4 + fs/dlm/lock.c | 10 ++ fs/dlm/main.c | 3 + include/trace/events/dlm.h | 220 +++++++++++++++++++++++++++++++++++++ 4 files changed, 237 insertions(+) create mode 100644 include/trace/events/dlm.h diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c index 6600930497ccc..27bae7d4a477a 100644 --- a/fs/dlm/ast.c +++ b/fs/dlm/ast.c @@ -9,6 +9,8 @@ ******************************************************************************* ******************************************************************************/ +#include + #include "dlm_internal.h" #include "lock.h" #include "user.h" @@ -254,10 +256,12 @@ void dlm_callback_work(struct work_struct *work) continue; } else if (callbacks[i].flags & DLM_CB_BAST) { bastfn(lkb->lkb_astparam, callbacks[i].mode); + trace_dlm_bast(ls, lkb, callbacks[i].mode); } else if (callbacks[i].flags & DLM_CB_CAST) { lkb->lkb_lksb->sb_status = callbacks[i].sb_status; lkb->lkb_lksb->sb_flags = callbacks[i].sb_flags; castfn(lkb->lkb_astparam); + trace_dlm_ast(ls, lkb, lkb->lkb_lksb); } } diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index c502c065d0075..feb2e94f5879e 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -53,6 +53,8 @@ R: do_xxxx() L: receive_xxxx_reply() <- R: send_xxxx_reply() */ +#include + 
#include #include #include @@ -3437,6 +3439,8 @@ int dlm_lock(dlm_lockspace_t *lockspace, if (error) goto out; + trace_dlm_lock_start(ls, lkb, mode, flags); + error = set_lock_args(mode, lksb, flags, namelen, 0, ast, astarg, bast, &args); if (error) @@ -3450,6 +3454,8 @@ int dlm_lock(dlm_lockspace_t *lockspace, if (error == -EINPROGRESS) error = 0; out_put: + trace_dlm_lock_end(ls, lkb, mode, flags, error); + if (convert || error) __put_lkb(ls, lkb); if (error == -EAGAIN || error == -EDEADLK) @@ -3481,6 +3487,8 @@ int dlm_unlock(dlm_lockspace_t *lockspace, if (error) goto out; + trace_dlm_unlock_start(ls, lkb, flags); + error = set_unlock_args(flags, astarg, &args); if (error) goto out_put; @@ -3495,6 +3503,8 @@ int dlm_unlock(dlm_lockspace_t *lockspace, if (error == -EBUSY && (flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK))) error = 0; out_put: + trace_dlm_unlock_end(ls, lkb, flags, error); + dlm_put_lkb(lkb); out: dlm_unlock_recovery(ls); diff --git a/fs/dlm/main.c b/fs/dlm/main.c index afc66a1346d3d..1c5be4b70ac1b 100644 --- a/fs/dlm/main.c +++ b/fs/dlm/main.c @@ -19,6 +19,9 @@ #include "config.h" #include "lowcomms.h" +#define CREATE_TRACE_POINTS +#include + static int __init init_dlm(void) { int error; diff --git a/include/trace/events/dlm.h b/include/trace/events/dlm.h new file mode 100644 index 0000000000000..c97b4c163c3e9 --- /dev/null +++ b/include/trace/events/dlm.h @@ -0,0 +1,220 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM dlm + +#if !defined(_TRACE_DLM_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_DLM_H + +#include +#include +#include + +#include "../../../fs/dlm/dlm_internal.h" + +#define show_lock_flags(flags) __print_flags(flags, "|", \ + { DLM_LKF_NOQUEUE, "NOQUEUE" }, \ + { DLM_LKF_CANCEL, "CANCEL" }, \ + { DLM_LKF_CONVERT, "CONVERT" }, \ + { DLM_LKF_VALBLK, "VALBLK" }, \ + { DLM_LKF_QUECVT, "QUECVT" }, \ + { DLM_LKF_IVVALBLK, "IVVALBLK" }, \ + { DLM_LKF_CONVDEADLK, "CONVDEADLK" }, \ + { 
DLM_LKF_PERSISTENT, "PERSISTENT" }, \ + { DLM_LKF_NODLCKWT, "NODLCKWT" }, \ + { DLM_LKF_NODLCKBLK, "NODLCKBLK" }, \ + { DLM_LKF_EXPEDITE, "EXPEDITE" }, \ + { DLM_LKF_NOQUEUEBAST, "NOQUEUEBAST" }, \ + { DLM_LKF_HEADQUE, "HEADQUE" }, \ + { DLM_LKF_NOORDER, "NOORDER" }, \ + { DLM_LKF_ORPHAN, "ORPHAN" }, \ + { DLM_LKF_ALTPR, "ALTPR" }, \ + { DLM_LKF_ALTCW, "ALTCW" }, \ + { DLM_LKF_FORCEUNLOCK, "FORCEUNLOCK" }, \ + { DLM_LKF_TIMEOUT, "TIMEOUT" }) + +#define show_lock_mode(mode) __print_symbolic(mode, \ + { DLM_LOCK_IV, "IV"}, \ + { DLM_LOCK_NL, "NL"}, \ + { DLM_LOCK_CR, "CR"}, \ + { DLM_LOCK_CW, "CW"}, \ + { DLM_LOCK_PR, "PR"}, \ + { DLM_LOCK_PW, "PW"}, \ + { DLM_LOCK_EX, "EX"}) + +#define show_dlm_sb_flags(flags) __print_flags(flags, "|", \ + { DLM_SBF_DEMOTED, "DEMOTED" }, \ + { DLM_SBF_VALNOTVALID, "VALNOTVALID" }, \ + { DLM_SBF_ALTMODE, "ALTMODE" }) + +/* note: we begin tracing dlm_lock_start() only if ls and lkb are found */ +TRACE_EVENT(dlm_lock_start, + + TP_PROTO(struct dlm_ls *ls, struct dlm_lkb *lkb, int mode, + __u32 flags), + + TP_ARGS(ls, lkb, mode, flags), + + TP_STRUCT__entry( + __field(__u32, ls_id) + __field(__u32, lkb_id) + __field(int, mode) + __field(__u32, flags) + ), + + TP_fast_assign( + __entry->ls_id = ls->ls_global_id; + __entry->lkb_id = lkb->lkb_id; + __entry->mode = mode; + __entry->flags = flags; + ), + + TP_printk("ls_id=%u lkb_id=%x mode=%s flags=%s", + __entry->ls_id, __entry->lkb_id, + show_lock_mode(__entry->mode), + show_lock_flags(__entry->flags)) + +); + +TRACE_EVENT(dlm_lock_end, + + TP_PROTO(struct dlm_ls *ls, struct dlm_lkb *lkb, int mode, __u32 flags, + int error), + + TP_ARGS(ls, lkb, mode, flags, error), + + TP_STRUCT__entry( + __field(__u32, ls_id) + __field(__u32, lkb_id) + __field(int, mode) + __field(__u32, flags) + __field(int, error) + ), + + TP_fast_assign( + __entry->ls_id = ls->ls_global_id; + __entry->lkb_id = lkb->lkb_id; + __entry->mode = mode; + __entry->flags = flags; + + /* return value will be zeroed in those 
cases by dlm_lock() + * we do it here again to not introduce more overhead if + * trace isn't running and error reflects the return value. + */ + if (error == -EAGAIN || error == -EDEADLK) + __entry->error = 0; + else + __entry->error = error; + ), + + TP_printk("ls_id=%u lkb_id=%x mode=%s flags=%s error=%d", + __entry->ls_id, __entry->lkb_id, + show_lock_mode(__entry->mode), + show_lock_flags(__entry->flags), __entry->error) + +); + +TRACE_EVENT(dlm_bast, + + TP_PROTO(struct dlm_ls *ls, struct dlm_lkb *lkb, int mode), + + TP_ARGS(ls, lkb, mode), + + TP_STRUCT__entry( + __field(__u32, ls_id) + __field(__u32, lkb_id) + __field(int, mode) + ), + + TP_fast_assign( + __entry->ls_id = ls->ls_global_id; + __entry->lkb_id = lkb->lkb_id; + __entry->mode = mode; + ), + + TP_printk("ls_id=%u lkb_id=%x mode=%s", __entry->ls_id, + __entry->lkb_id, show_lock_mode(__entry->mode)) + +); + +TRACE_EVENT(dlm_ast, + + TP_PROTO(struct dlm_ls *ls, struct dlm_lkb *lkb, struct dlm_lksb *lksb), + + TP_ARGS(ls, lkb, lksb), + + TP_STRUCT__entry( + __field(__u32, ls_id) + __field(__u32, lkb_id) + __field(u8, sb_flags) + __field(int, sb_status) + ), + + TP_fast_assign( + __entry->ls_id = ls->ls_global_id; + __entry->lkb_id = lkb->lkb_id; + __entry->sb_flags = lksb->sb_flags; + __entry->sb_status = lksb->sb_status; + ), + + TP_printk("ls_id=%u lkb_id=%x sb_flags=%s sb_status=%d", + __entry->ls_id, __entry->lkb_id, + show_dlm_sb_flags(__entry->sb_flags), __entry->sb_status) + +); + +/* note: we begin tracing dlm_unlock_start() only if ls and lkb are found */ +TRACE_EVENT(dlm_unlock_start, + + TP_PROTO(struct dlm_ls *ls, struct dlm_lkb *lkb, __u32 flags), + + TP_ARGS(ls, lkb, flags), + + TP_STRUCT__entry( + __field(__u32, ls_id) + __field(__u32, lkb_id) + __field(__u32, flags) + ), + + TP_fast_assign( + __entry->ls_id = ls->ls_global_id; + __entry->lkb_id = lkb->lkb_id; + __entry->flags = flags; + ), + + TP_printk("ls_id=%u lkb_id=%x flags=%s", + __entry->ls_id, __entry->lkb_id, + 
show_lock_flags(__entry->flags)) + +); + +TRACE_EVENT(dlm_unlock_end, + + TP_PROTO(struct dlm_ls *ls, struct dlm_lkb *lkb, __u32 flags, + int error), + + TP_ARGS(ls, lkb, flags, error), + + TP_STRUCT__entry( + __field(__u32, ls_id) + __field(__u32, lkb_id) + __field(__u32, flags) + __field(int, error) + ), + + TP_fast_assign( + __entry->ls_id = ls->ls_global_id; + __entry->lkb_id = lkb->lkb_id; + __entry->flags = flags; + __entry->error = error; + ), + + TP_printk("ls_id=%u lkb_id=%x flags=%s error=%d", + __entry->ls_id, __entry->lkb_id, + show_lock_flags(__entry->flags), __entry->error) + +); + +#endif /* if !defined(_TRACE_DLM_H) || defined(TRACE_HEADER_MULTI_READ) */ + +/* This part must be outside protection */ +#include From 92732376fd29462b502f41486bcef55f49c5713e Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Tue, 2 Nov 2021 15:17:16 -0400 Subject: [PATCH 10/32] fs: dlm: trace socket handling This patch adds tracepoints for dlm socket receive and send functionality. We can use it to track how much data was send or received to or from a specific nodeid. 
Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 4 ++++ include/trace/events/dlm.h | 40 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index bee3757eb4c73..6d6dcf0d5ba97 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -53,6 +53,8 @@ #include #include +#include + #include "dlm_internal.h" #include "lowcomms.h" #include "midcomms.h" @@ -925,6 +927,7 @@ static int receive_from_sock(struct connection *con) msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; ret = kernel_recvmsg(con->sock, &msg, &iov, 1, iov.iov_len, msg.msg_flags); + trace_dlm_recv(con->nodeid, ret); if (ret == -EAGAIN) break; else if (ret <= 0) @@ -1411,6 +1414,7 @@ static void send_to_sock(struct connection *con) ret = kernel_sendpage(con->sock, e->page, offset, len, msg_flags); + trace_dlm_send(con->nodeid, ret); if (ret == -EAGAIN || ret == 0) { if (ret == -EAGAIN && test_bit(SOCKWQ_ASYNC_NOSPACE, &con->sock->flags) && diff --git a/include/trace/events/dlm.h b/include/trace/events/dlm.h index c97b4c163c3e9..32088c6032445 100644 --- a/include/trace/events/dlm.h +++ b/include/trace/events/dlm.h @@ -214,6 +214,46 @@ TRACE_EVENT(dlm_unlock_end, ); +TRACE_EVENT(dlm_send, + + TP_PROTO(int nodeid, int ret), + + TP_ARGS(nodeid, ret), + + TP_STRUCT__entry( + __field(int, nodeid) + __field(int, ret) + ), + + TP_fast_assign( + __entry->nodeid = nodeid; + __entry->ret = ret; + ), + + TP_printk("nodeid=%d ret=%d", __entry->nodeid, __entry->ret) + +); + +TRACE_EVENT(dlm_recv, + + TP_PROTO(int nodeid, int ret), + + TP_ARGS(nodeid, ret), + + TP_STRUCT__entry( + __field(int, nodeid) + __field(int, ret) + ), + + TP_fast_assign( + __entry->nodeid = nodeid; + __entry->ret = ret; + ), + + TP_printk("nodeid=%d ret=%d", __entry->nodeid, __entry->ret) + +); + #endif /* if !defined(_TRACE_DLM_H) || defined(TRACE_HEADER_MULTI_READ) */ /* This part must be outside protection */ From 
164d88abd7608e869b7617d5ff8893344fdda759 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Tue, 2 Nov 2021 15:17:17 -0400 Subject: [PATCH 11/32] fs: dlm: requestqueue busy wait to event based wait This patch changes the requestqueue busy waiting algorithm to use atomic counter values and wait_event() to wait until the requestqueue is empty. It will slightly reduce the number of holding ls_requestqueue_mutex mutex. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/dlm_internal.h | 2 ++ fs/dlm/lockspace.c | 2 ++ fs/dlm/requestqueue.c | 15 +++++++-------- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 44a5c67b52134..fd1c7a8c44855 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -626,6 +626,8 @@ struct dlm_ls { struct rw_semaphore ls_in_recovery; /* block local requests */ struct rw_semaphore ls_recv_active; /* block dlm_recv */ struct list_head ls_requestqueue;/* queue remote requests */ + atomic_t ls_requestqueue_cnt; + wait_queue_head_t ls_requestqueue_wait; struct mutex ls_requestqueue_mutex; struct dlm_rcom *ls_recover_buf; int ls_recover_nodeid; /* for debugging */ diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index b90566502a813..4e4181304ca16 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -564,6 +564,8 @@ static int new_lockspace(const char *name, const char *cluster, init_rwsem(&ls->ls_in_recovery); init_rwsem(&ls->ls_recv_active); INIT_LIST_HEAD(&ls->ls_requestqueue); + atomic_set(&ls->ls_requestqueue_cnt, 0); + init_waitqueue_head(&ls->ls_requestqueue_wait); mutex_init(&ls->ls_requestqueue_mutex); mutex_init(&ls->ls_clear_proc_locks); diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c index e89e0ff8bfa3a..d0cf68570dcf6 100644 --- a/fs/dlm/requestqueue.c +++ b/fs/dlm/requestqueue.c @@ -44,6 +44,7 @@ void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_message *ms) e->nodeid = nodeid; memcpy(&e->request, ms, 
ms->m_header.h_length); + atomic_inc(&ls->ls_requestqueue_cnt); mutex_lock(&ls->ls_requestqueue_mutex); list_add_tail(&e->list, &ls->ls_requestqueue); mutex_unlock(&ls->ls_requestqueue_mutex); @@ -89,6 +90,8 @@ int dlm_process_requestqueue(struct dlm_ls *ls) mutex_lock(&ls->ls_requestqueue_mutex); list_del(&e->list); + if (atomic_dec_and_test(&ls->ls_requestqueue_cnt)) + wake_up(&ls->ls_requestqueue_wait); kfree(e); if (dlm_locking_stopped(ls)) { @@ -115,14 +118,8 @@ int dlm_process_requestqueue(struct dlm_ls *ls) void dlm_wait_requestqueue(struct dlm_ls *ls) { - for (;;) { - mutex_lock(&ls->ls_requestqueue_mutex); - if (list_empty(&ls->ls_requestqueue)) - break; - mutex_unlock(&ls->ls_requestqueue_mutex); - schedule(); - } - mutex_unlock(&ls->ls_requestqueue_mutex); + wait_event(ls->ls_requestqueue_wait, + atomic_read(&ls->ls_requestqueue_cnt) == 0); } static int purge_request(struct dlm_ls *ls, struct dlm_message *ms, int nodeid) @@ -161,6 +158,8 @@ void dlm_purge_requestqueue(struct dlm_ls *ls) if (purge_request(ls, ms, e->nodeid)) { list_del(&e->list); + if (atomic_dec_and_test(&ls->ls_requestqueue_cnt)) + wake_up(&ls->ls_requestqueue_wait); kfree(e); } } From 3cb5977c5214c219b2859f926ed547480d53fdde Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Tue, 2 Nov 2021 15:17:18 -0400 Subject: [PATCH 12/32] fs: dlm: ls_count busy wait to event based wait This patch changes the ls_count busy wait to use atomic counter values and wait_event() to wait until ls_count reaches zero. It will slightly reduce the number of times lslist_lock is held. When removing a lockspace we need to retry the wait because a lockspace get could interfere between wait_event() and taking the lock under which the lockspace list entry is deleted. 
Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/dlm_internal.h | 3 ++- fs/dlm/lockspace.c | 33 +++++++++++++++++---------------- fs/dlm/requestqueue.c | 2 +- 3 files changed, 20 insertions(+), 18 deletions(-) diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index fd1c7a8c44855..019931804af9b 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -548,8 +548,9 @@ struct dlm_ls { uint32_t ls_generation; uint32_t ls_exflags; int ls_lvblen; - int ls_count; /* refcount of processes in + atomic_t ls_count; /* refcount of processes in the dlm using this ls */ + wait_queue_head_t ls_count_wait; int ls_create_count; /* create/release refcount */ unsigned long ls_flags; /* LSFL_ */ unsigned long ls_scan_time; diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 4e4181304ca16..2e51bd2bdacce 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -314,7 +314,7 @@ struct dlm_ls *dlm_find_lockspace_global(uint32_t id) list_for_each_entry(ls, &lslist, ls_list) { if (ls->ls_global_id == id) { - ls->ls_count++; + atomic_inc(&ls->ls_count); goto out; } } @@ -331,7 +331,7 @@ struct dlm_ls *dlm_find_lockspace_local(dlm_lockspace_t *lockspace) spin_lock(&lslist_lock); list_for_each_entry(ls, &lslist, ls_list) { if (ls->ls_local_handle == lockspace) { - ls->ls_count++; + atomic_inc(&ls->ls_count); goto out; } } @@ -348,7 +348,7 @@ struct dlm_ls *dlm_find_lockspace_device(int minor) spin_lock(&lslist_lock); list_for_each_entry(ls, &lslist, ls_list) { if (ls->ls_device.minor == minor) { - ls->ls_count++; + atomic_inc(&ls->ls_count); goto out; } } @@ -360,24 +360,24 @@ struct dlm_ls *dlm_find_lockspace_device(int minor) void dlm_put_lockspace(struct dlm_ls *ls) { - spin_lock(&lslist_lock); - ls->ls_count--; - spin_unlock(&lslist_lock); + if (atomic_dec_and_test(&ls->ls_count)) + wake_up(&ls->ls_count_wait); } static void remove_lockspace(struct dlm_ls *ls) { - for (;;) { - spin_lock(&lslist_lock); - if (ls->ls_count == 0) { - 
WARN_ON(ls->ls_create_count != 0); - list_del(&ls->ls_list); - spin_unlock(&lslist_lock); - return; - } +retry: + wait_event(ls->ls_count_wait, atomic_read(&ls->ls_count) == 0); + + spin_lock(&lslist_lock); + if (atomic_read(&ls->ls_count) != 0) { spin_unlock(&lslist_lock); - ssleep(1); + goto retry; } + + WARN_ON(ls->ls_create_count != 0); + list_del(&ls->ls_list); + spin_unlock(&lslist_lock); } static int threads_start(void) @@ -481,7 +481,8 @@ static int new_lockspace(const char *name, const char *cluster, memcpy(ls->ls_name, name, namelen); ls->ls_namelen = namelen; ls->ls_lvblen = lvblen; - ls->ls_count = 0; + atomic_set(&ls->ls_count, 0); + init_waitqueue_head(&ls->ls_count_wait); ls->ls_flags = 0; ls->ls_scan_time = jiffies; diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c index d0cf68570dcf6..ccb5307c21e90 100644 --- a/fs/dlm/requestqueue.c +++ b/fs/dlm/requestqueue.c @@ -127,7 +127,7 @@ static int purge_request(struct dlm_ls *ls, struct dlm_message *ms, int nodeid) uint32_t type = ms->m_type; /* the ls is being cleaned up and freed by release_lockspace */ - if (!ls->ls_count) + if (!atomic_read(&ls->ls_count)) return 1; if (dlm_is_removed(ls, nodeid)) From 5c16febbc19bb463bfb8e80cb5b24ec6ff1a439f Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Tue, 2 Nov 2021 15:17:19 -0400 Subject: [PATCH 13/32] fs: dlm: let handle callback data as void This patch changes the dlm_lowcomms_new_msg() function pointer private data from "struct mhandle *" to "void *" to provide different structures than just "struct mhandle". 
Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 19 +++++++++---------- fs/dlm/lowcomms.h | 4 ++-- fs/dlm/midcomms.c | 4 +++- 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 6d6dcf0d5ba97..3f8b015ba7990 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1205,8 +1205,7 @@ static struct writequeue_entry *new_writequeue_entry(struct connection *con, static struct writequeue_entry *new_wq_entry(struct connection *con, int len, gfp_t allocation, char **ppc, - void (*cb)(struct dlm_mhandle *mh), - struct dlm_mhandle *mh) + void (*cb)(void *data), void *data) { struct writequeue_entry *e; @@ -1218,7 +1217,7 @@ static struct writequeue_entry *new_wq_entry(struct connection *con, int len, *ppc = page_address(e->page) + e->end; if (cb) - cb(mh); + cb(data); e->end += len; e->users++; @@ -1240,7 +1239,7 @@ static struct writequeue_entry *new_wq_entry(struct connection *con, int len, spin_lock(&con->writequeue_lock); if (cb) - cb(mh); + cb(data); list_add_tail(&e->list, &con->writequeue); spin_unlock(&con->writequeue_lock); @@ -1250,8 +1249,8 @@ static struct writequeue_entry *new_wq_entry(struct connection *con, int len, static struct dlm_msg *dlm_lowcomms_new_msg_con(struct connection *con, int len, gfp_t allocation, char **ppc, - void (*cb)(struct dlm_mhandle *mh), - struct dlm_mhandle *mh) + void (*cb)(void *data), + void *data) { struct writequeue_entry *e; struct dlm_msg *msg; @@ -1274,7 +1273,7 @@ static struct dlm_msg *dlm_lowcomms_new_msg_con(struct connection *con, int len, kref_init(&msg->ref); - e = new_wq_entry(con, len, allocation, ppc, cb, mh); + e = new_wq_entry(con, len, allocation, ppc, cb, data); if (!e) { if (sleepable) mutex_unlock(&con->wq_alloc); @@ -1294,8 +1293,8 @@ static struct dlm_msg *dlm_lowcomms_new_msg_con(struct connection *con, int len, } struct dlm_msg *dlm_lowcomms_new_msg(int nodeid, int len, gfp_t allocation, - char **ppc, void 
(*cb)(struct dlm_mhandle *mh), - struct dlm_mhandle *mh) + char **ppc, void (*cb)(void *data), + void *data) { struct connection *con; struct dlm_msg *msg; @@ -1316,7 +1315,7 @@ struct dlm_msg *dlm_lowcomms_new_msg(int nodeid, int len, gfp_t allocation, return NULL; } - msg = dlm_lowcomms_new_msg_con(con, len, allocation, ppc, cb, mh); + msg = dlm_lowcomms_new_msg_con(con, len, allocation, ppc, cb, data); if (!msg) { srcu_read_unlock(&connections_srcu, idx); return NULL; diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h index 4ccae07cf0058..8108ea24ec301 100644 --- a/fs/dlm/lowcomms.h +++ b/fs/dlm/lowcomms.h @@ -38,8 +38,8 @@ void dlm_lowcomms_stop(void); void dlm_lowcomms_exit(void); int dlm_lowcomms_close(int nodeid); struct dlm_msg *dlm_lowcomms_new_msg(int nodeid, int len, gfp_t allocation, - char **ppc, void (*cb)(struct dlm_mhandle *mh), - struct dlm_mhandle *mh); + char **ppc, void (*cb)(void *data), + void *data); void dlm_lowcomms_commit_msg(struct dlm_msg *msg); void dlm_lowcomms_put_msg(struct dlm_msg *msg); int dlm_lowcomms_resend_msg(struct dlm_msg *msg); diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index 76bdc3a9dc61d..95a5643a950e3 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -1020,8 +1020,10 @@ static void dlm_fill_opts_header(struct dlm_opts *opts, uint16_t inner_len, header_out(&opts->o_header); } -static void midcomms_new_msg_cb(struct dlm_mhandle *mh) +static void midcomms_new_msg_cb(void *data) { + struct dlm_mhandle *mh = data; + atomic_inc(&mh->node->send_queue_cnt); spin_lock(&mh->node->send_queue_lock); From 9af5b8f0ead7cd90161b0555ed8e85ee38f79fa5 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Tue, 2 Nov 2021 15:17:20 -0400 Subject: [PATCH 14/32] fs: dlm: add debugfs rawmsg send functionality This patch adds a dlm functionality to send a raw dlm message to a specific cluster node. This raw message can be build by user space and send out by writing the message to "rawmsg" dlm debugfs file. 
There is an in-progress scapy dlm module which provides an easy way to build DLM messages in user space. For example: DLM(h_cmd=3, o_nextcmd=1, h_nodeid=1, h_lockspace=0xe4f48a18, ...) The goal is to provide an easily reproducible state to crash DLM or to fuzz the DLM kernel stack if there are possible ways to crash it. Note that if the sequence number is zero and the dlm version is not set to 3.1, the kernel will automatically set a correct sequence number; otherwise DLM stack testing is not possible. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/debug_fs.c | 37 ++++++++++++++++++++++++++++++++++++ fs/dlm/midcomms.c | 48 +++++++++++++++++++++++++++++++++++++++++++++++ fs/dlm/midcomms.h | 2 ++ 3 files changed, 87 insertions(+) diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c index 47e9d57e4cae3..555904eeea8ea 100644 --- a/fs/dlm/debug_fs.c +++ b/fs/dlm/debug_fs.c @@ -768,6 +768,42 @@ static int dlm_version_show(struct seq_file *file, void *offset) } DEFINE_SHOW_ATTRIBUTE(dlm_version); +static ssize_t dlm_rawmsg_write(struct file *fp, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + void *buf; + int ret; + + if (count > PAGE_SIZE || count < sizeof(struct dlm_header)) + return -EINVAL; + + buf = kmalloc(PAGE_SIZE, GFP_NOFS); + if (!buf) + return -ENOMEM; + + if (copy_from_user(buf, user_buf, count)) { + ret = -EFAULT; + goto out; + } + + ret = dlm_midcomms_rawmsg_send(fp->private_data, buf, count); + if (ret) + goto out; + + kfree(buf); + return count; + +out: + kfree(buf); + return ret; +} + +static const struct file_operations dlm_rawmsg_fops = { + .open = simple_open, + .write = dlm_rawmsg_write, + .llseek = no_llseek, +}; + void *dlm_create_debug_comms_file(int nodeid, void *data) { struct dentry *d_node; @@ -782,6 +818,7 @@ void *dlm_create_debug_comms_file(int nodeid, void *data) debugfs_create_file("send_queue_count", 0444, d_node, data, &dlm_send_queue_cnt_fops); debugfs_create_file("version", 0444, d_node, data, 
&dlm_version_fops); + debugfs_create_file("rawmsg", 0200, d_node, data, &dlm_rawmsg_fops); return d_node; } diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index 95a5643a950e3..0b9bce6f04e14 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -1427,3 +1427,51 @@ int dlm_midcomms_close(int nodeid) return ret; } + +/* debug functionality to send raw dlm msg from user space */ +struct dlm_rawmsg_data { + struct midcomms_node *node; + void *buf; +}; + +static void midcomms_new_rawmsg_cb(void *data) +{ + struct dlm_rawmsg_data *rd = data; + struct dlm_header *h = rd->buf; + + switch (h->h_version) { + case cpu_to_le32(DLM_VERSION_3_1): + break; + default: + switch (h->h_cmd) { + case DLM_OPTS: + if (!h->u.h_seq) + h->u.h_seq = rd->node->seq_send++; + break; + default: + break; + } + break; + } +} + +int dlm_midcomms_rawmsg_send(struct midcomms_node *node, void *buf, + int buflen) +{ + struct dlm_rawmsg_data rd; + struct dlm_msg *msg; + char *msgbuf; + + rd.node = node; + rd.buf = buf; + + msg = dlm_lowcomms_new_msg(node->nodeid, buflen, GFP_NOFS, + &msgbuf, midcomms_new_rawmsg_cb, &rd); + if (!msg) + return -ENOMEM; + + memcpy(msgbuf, buf, buflen); + dlm_lowcomms_commit_msg(msg); + return 0; +} + diff --git a/fs/dlm/midcomms.h b/fs/dlm/midcomms.h index 579abc6929be2..bc63cf73aa872 100644 --- a/fs/dlm/midcomms.h +++ b/fs/dlm/midcomms.h @@ -28,6 +28,8 @@ const char *dlm_midcomms_state(struct midcomms_node *node); unsigned long dlm_midcomms_flags(struct midcomms_node *node); int dlm_midcomms_send_queue_cnt(struct midcomms_node *node); uint32_t dlm_midcomms_version(struct midcomms_node *node); +int dlm_midcomms_rawmsg_send(struct midcomms_node *node, void *buf, + int buflen); #endif /* __MIDCOMMS_DOT_H__ */ From 75d25ffe380a01b88cb3bf604a6b8dc5a562a2e5 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Tue, 2 Nov 2021 15:17:21 -0400 Subject: [PATCH 15/32] fs: dlm: allow create lkb with specific id range This patch adds functionality to add a lkb with a specific 
id range. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lock.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index feb2e94f5879e..8b30c9d9e545d 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -1180,7 +1180,8 @@ static void detach_lkb(struct dlm_lkb *lkb) } } -static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret) +static int _create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret, + int start, int end) { struct dlm_lkb *lkb; int rv; @@ -1201,7 +1202,7 @@ static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret) idr_preload(GFP_NOFS); spin_lock(&ls->ls_lkbidr_spin); - rv = idr_alloc(&ls->ls_lkbidr, lkb, 1, 0, GFP_NOWAIT); + rv = idr_alloc(&ls->ls_lkbidr, lkb, start, end, GFP_NOWAIT); if (rv >= 0) lkb->lkb_id = rv; spin_unlock(&ls->ls_lkbidr_spin); @@ -1217,6 +1218,11 @@ static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret) return 0; } +static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret) +{ + return _create_lkb(ls, lkb_ret, 1, 0); +} + static int find_lkb(struct dlm_ls *ls, uint32_t lkid, struct dlm_lkb **lkb_ret) { struct dlm_lkb *lkb; From 5054e79de99984b4f39a073534526bc7c827b1e0 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Tue, 2 Nov 2021 15:17:22 -0400 Subject: [PATCH 16/32] fs: dlm: add lkb debugfs functionality This patch adds functionality to add an lkb during runtime. This is purely a debugging feature; wrong input can crash the kernel. It is an early-stage feature as well. The goal is to provide a user interface for manipulating dlm state and to combine it with the rawmsg feature. It is debugfs functionality, so we don't care about UAPI breakage. It is even possible to add lkbs/rsbs which could never exist in such a way during normal DLM operation. The user of this interface always needs to think before using this feature; not every crash which happens can really occur during normal dlm operation. 
Future there should be more functionality to add a more realistic lkb which reflects normal DLM state inside the kernel. For now this is enough. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/debug_fs.c | 32 +++++++++++++++++++++++++++++++- fs/dlm/lock.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ fs/dlm/lock.h | 2 ++ 3 files changed, 79 insertions(+), 1 deletion(-) diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c index 555904eeea8ea..2ead4751d6556 100644 --- a/fs/dlm/debug_fs.c +++ b/fs/dlm/debug_fs.c @@ -635,6 +635,35 @@ static int table_open2(struct inode *inode, struct file *file) return 0; } +static ssize_t table_write2(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct seq_file *seq = file->private_data; + int n, len, lkb_nodeid, lkb_status, error; + char name[DLM_RESNAME_MAXLEN] = {}; + struct dlm_ls *ls = seq->private; + unsigned int lkb_flags; + char buf[256] = {}; + uint32_t lkb_id; + + if (copy_from_user(buf, user_buf, + min_t(size_t, sizeof(buf) - 1, count))) + return -EFAULT; + + n = sscanf(buf, "%x %" __stringify(DLM_RESNAME_MAXLEN) "s %x %d %d", + &lkb_id, name, &lkb_flags, &lkb_nodeid, &lkb_status); + if (n != 5) + return -EINVAL; + + len = strnlen(name, DLM_RESNAME_MAXLEN); + error = dlm_debug_add_lkb(ls, lkb_id, name, len, lkb_flags, + lkb_nodeid, lkb_status); + if (error) + return error; + + return count; +} + static int table_open3(struct inode *inode, struct file *file) { struct seq_file *seq; @@ -675,6 +704,7 @@ static const struct file_operations format2_fops = { .owner = THIS_MODULE, .open = table_open2, .read = seq_read, + .write = table_write2, .llseek = seq_lseek, .release = seq_release }; @@ -846,7 +876,7 @@ void dlm_create_debug_file(struct dlm_ls *ls) snprintf(name, DLM_LOCKSPACE_LEN + 8, "%s_locks", ls->ls_name); ls->ls_debug_locks_dentry = debugfs_create_file(name, - S_IFREG | S_IRUGO, + 0644, dlm_root, ls, &format2_fops); diff --git a/fs/dlm/lock.c 
b/fs/dlm/lock.c index 8b30c9d9e545d..aeb793693d8c5 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -6317,3 +6317,49 @@ int dlm_user_purge(struct dlm_ls *ls, struct dlm_user_proc *proc, return error; } +/* debug functionality */ +int dlm_debug_add_lkb(struct dlm_ls *ls, uint32_t lkb_id, char *name, int len, + int lkb_nodeid, unsigned int lkb_flags, int lkb_status) +{ + struct dlm_lksb *lksb; + struct dlm_lkb *lkb; + struct dlm_rsb *r; + int error; + + /* we currently can't set a valid user lock */ + if (lkb_flags & DLM_IFL_USER) + return -EOPNOTSUPP; + + lksb = kzalloc(sizeof(*lksb), GFP_NOFS); + if (!lksb) + return -ENOMEM; + + error = _create_lkb(ls, &lkb, lkb_id, lkb_id + 1); + if (error) { + kfree(lksb); + return error; + } + + lkb->lkb_flags = lkb_flags; + lkb->lkb_nodeid = lkb_nodeid; + lkb->lkb_lksb = lksb; + /* user specific pointer, just don't have it NULL for kernel locks */ + if (~lkb_flags & DLM_IFL_USER) + lkb->lkb_astparam = (void *)0xDEADBEEF; + + error = find_rsb(ls, name, len, 0, R_REQUEST, &r); + if (error) { + kfree(lksb); + __put_lkb(ls, lkb); + return error; + } + + lock_rsb(r); + attach_lkb(r, lkb); + add_lkb(r, lkb, lkb_status); + unlock_rsb(r); + put_rsb(r); + + return 0; +} + diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h index 456c6ec3ef6f4..863a66e128a22 100644 --- a/fs/dlm/lock.h +++ b/fs/dlm/lock.h @@ -58,6 +58,8 @@ int dlm_user_purge(struct dlm_ls *ls, struct dlm_user_proc *proc, int nodeid, int pid); int dlm_user_deadlock(struct dlm_ls *ls, uint32_t flags, uint32_t lkid); void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc); +int dlm_debug_add_lkb(struct dlm_ls *ls, uint32_t lkb_id, char *name, int len, + int lkb_nodeid, unsigned int lkb_flags, int lkb_status); static inline int is_master(struct dlm_rsb *r) { From 63eab2b00bcff620682e8570367458c9619a9970 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Tue, 2 Nov 2021 15:17:23 -0400 Subject: [PATCH 17/32] fs: dlm: add lkb waiters debugfs functionality This patch 
adds functionality to put a lkb to the waiters state. It can be useful to combine this feature with the "rawmsg" debugfs functionality. It will bring the DLM lkb into a state that a message will be parsed by the kernel. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/debug_fs.c | 27 ++++++++++++++++++++++++++- fs/dlm/lock.c | 15 +++++++++++++++ fs/dlm/lock.h | 2 ++ 3 files changed, 43 insertions(+), 1 deletion(-) diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c index 2ead4751d6556..df6f3f107be4f 100644 --- a/fs/dlm/debug_fs.c +++ b/fs/dlm/debug_fs.c @@ -754,10 +754,35 @@ static ssize_t waiters_read(struct file *file, char __user *userbuf, return rv; } +static ssize_t waiters_write(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct dlm_ls *ls = file->private_data; + int mstype, to_nodeid; + char buf[128] = {}; + uint32_t lkb_id; + int n, error; + + if (copy_from_user(buf, user_buf, + min_t(size_t, sizeof(buf) - 1, count))) + return -EFAULT; + + n = sscanf(buf, "%x %d %d", &lkb_id, &mstype, &to_nodeid); + if (n != 3) + return -EINVAL; + + error = dlm_debug_add_lkb_to_waiters(ls, lkb_id, mstype, to_nodeid); + if (error) + return error; + + return count; +} + static const struct file_operations waiters_fops = { .owner = THIS_MODULE, .open = simple_open, .read = waiters_read, + .write = waiters_write, .llseek = default_llseek, }; @@ -907,7 +932,7 @@ void dlm_create_debug_file(struct dlm_ls *ls) snprintf(name, DLM_LOCKSPACE_LEN + 8, "%s_waiters", ls->ls_name); ls->ls_debug_waiters_dentry = debugfs_create_file(name, - S_IFREG | S_IRUGO, + 0644, dlm_root, ls, &waiters_fops); diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index aeb793693d8c5..0dbe273566c0b 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -6363,3 +6363,18 @@ int dlm_debug_add_lkb(struct dlm_ls *ls, uint32_t lkb_id, char *name, int len, return 0; } +int dlm_debug_add_lkb_to_waiters(struct dlm_ls *ls, uint32_t lkb_id, + int mstype, int to_nodeid) +{ 
+ struct dlm_lkb *lkb; + int error; + + error = find_lkb(ls, lkb_id, &lkb); + if (error) + return error; + + error = add_to_waiters(lkb, mstype, to_nodeid); + dlm_put_lkb(lkb); + return error; +} + diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h index 863a66e128a22..252a5898f9081 100644 --- a/fs/dlm/lock.h +++ b/fs/dlm/lock.h @@ -60,6 +60,8 @@ int dlm_user_deadlock(struct dlm_ls *ls, uint32_t flags, uint32_t lkid); void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc); int dlm_debug_add_lkb(struct dlm_ls *ls, uint32_t lkb_id, char *name, int len, int lkb_nodeid, unsigned int lkb_flags, int lkb_status); +int dlm_debug_add_lkb_to_waiters(struct dlm_ls *ls, uint32_t lkb_id, + int mstype, int to_nodeid); static inline int is_master(struct dlm_rsb *r) { From 6c2e3bf68f3e5e5a647aa52be246d5f552d7496d Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Tue, 2 Nov 2021 15:17:24 -0400 Subject: [PATCH 18/32] fs: dlm: filter user dlm messages for kernel locks This patch fixes the following crash by receiving a invalid message: [ 160.672220] ================================================================== [ 160.676206] BUG: KASAN: user-memory-access in dlm_user_add_ast+0xc3/0x370 [ 160.679659] Read of size 8 at addr 00000000deadbeef by task kworker/u32:13/319 [ 160.681447] [ 160.681824] CPU: 10 PID: 319 Comm: kworker/u32:13 Not tainted 5.14.0-rc2+ #399 [ 160.683472] Hardware name: Red Hat KVM/RHEL-AV, BIOS 1.14.0-1.module+el8.6.0+12648+6ede71a5 04/01/2014 [ 160.685574] Workqueue: dlm_recv process_recv_sockets [ 160.686721] Call Trace: [ 160.687310] dump_stack_lvl+0x56/0x6f [ 160.688169] ? dlm_user_add_ast+0xc3/0x370 [ 160.689116] kasan_report.cold.14+0x116/0x11b [ 160.690138] ? dlm_user_add_ast+0xc3/0x370 [ 160.690832] dlm_user_add_ast+0xc3/0x370 [ 160.691502] _receive_unlock_reply+0x103/0x170 [ 160.692241] _receive_message+0x11df/0x1ec0 [ 160.692926] ? rcu_read_lock_sched_held+0xa1/0xd0 [ 160.693700] ? rcu_read_lock_bh_held+0xb0/0xb0 [ 160.694427] ? 
lock_acquire+0x175/0x400 [ 160.695058] ? do_purge.isra.51+0x200/0x200 [ 160.695744] ? lock_acquired+0x360/0x5d0 [ 160.696400] ? lock_contended+0x6a0/0x6a0 [ 160.697055] ? lock_release+0x21d/0x5e0 [ 160.697686] ? lock_is_held_type+0xe0/0x110 [ 160.698352] ? lock_is_held_type+0xe0/0x110 [ 160.699026] ? ___might_sleep+0x1cc/0x1e0 [ 160.699698] ? dlm_wait_requestqueue+0x94/0x140 [ 160.700451] ? dlm_process_requestqueue+0x240/0x240 [ 160.701249] ? down_write_killable+0x2b0/0x2b0 [ 160.701988] ? do_raw_spin_unlock+0xa2/0x130 [ 160.702690] dlm_receive_buffer+0x1a5/0x210 [ 160.703385] dlm_process_incoming_buffer+0x726/0x9f0 [ 160.704210] receive_from_sock+0x1c0/0x3b0 [ 160.704886] ? dlm_tcp_shutdown+0x30/0x30 [ 160.705561] ? lock_acquire+0x175/0x400 [ 160.706197] ? rcu_read_lock_sched_held+0xa1/0xd0 [ 160.706941] ? rcu_read_lock_bh_held+0xb0/0xb0 [ 160.707681] process_recv_sockets+0x32/0x40 [ 160.708366] process_one_work+0x55e/0xad0 [ 160.709045] ? pwq_dec_nr_in_flight+0x110/0x110 [ 160.709820] worker_thread+0x65/0x5e0 [ 160.710423] ? process_one_work+0xad0/0xad0 [ 160.711087] kthread+0x1ed/0x220 [ 160.711628] ? set_kthread_struct+0x80/0x80 [ 160.712314] ret_from_fork+0x22/0x30 The issue is that we received a DLM message for a user lock but the destination lock is a kernel lock. Note that the address which it is trying to dereference is 00000000deadbeef, which is the kernel lock's lkb->lkb_astparam value; this field should never be dereferenced by the DLM kernel stack. In case of a user lock lkb->lkb_astparam is lkb->lkb_ua (memory is shared by a union field). The struct lkb_ua will be handled by the DLM kernel stack, but on a kernel lock it will contain invalid data and will most likely end up crashing the kernel. It can be reproduced with two cluster nodes. 
node 2: dlm_tool join test echo "862 fooobaar 1 2 1" > /sys/kernel/debug/dlm/test_locks echo "862 3 1" > /sys/kernel/debug/dlm/test_waiters node 1: dlm_tool join test python: foo = DLM(h_cmd=3, o_nextcmd=1, h_nodeid=1, h_lockspace=0x77222027, \ m_type=7, m_flags=0x1, m_remid=0x862, m_result=0xFFFEFFFE) newFile = open("/sys/kernel/debug/dlm/comms/2/rawmsg", "wb") newFile.write(bytes(foo)) Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lock.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 0dbe273566c0b..54705d367076b 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -3989,6 +3989,14 @@ static int validate_message(struct dlm_lkb *lkb, struct dlm_message *ms) int from = ms->m_header.h_nodeid; int error = 0; + /* currently mixing of user/kernel locks are not supported */ + if (ms->m_flags & DLM_IFL_USER && ~lkb->lkb_flags & DLM_IFL_USER) { + log_error(lkb->lkb_resource->res_ls, + "got user dlm message for a kernel lock"); + error = -EINVAL; + goto out; + } + switch (ms->m_type) { case DLM_MSG_CONVERT: case DLM_MSG_UNLOCK: @@ -4017,6 +4025,7 @@ static int validate_message(struct dlm_lkb *lkb, struct dlm_message *ms) error = -EINVAL; } +out: if (error) log_error(lkb->lkb_resource->res_ls, "ignore invalid message %d from %d %x %x %x %d", From b87b1883efe385e56384ff48e6f3108a33fde508 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 3 Nov 2021 17:04:18 -0400 Subject: [PATCH 19/32] fs: dlm: remove double list_first_entry call This patch removes a list_first_entry() call which is already done by the previous con_next_wq() call. 
Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 3f8b015ba7990..2f070514b3eed 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1405,7 +1405,6 @@ static void send_to_sock(struct connection *con) if (!e) break; - e = list_first_entry(&con->writequeue, struct writequeue_entry, list); len = e->len; offset = e->offset; BUG_ON(len == 0 && e->users == 0); From c8b9f34e223fcad1e9980f343587f38624331bbc Mon Sep 17 00:00:00 2001 From: Zhang Mingyu Date: Fri, 5 Nov 2021 01:43:20 +0000 Subject: [PATCH 20/32] fs: dlm:Remove unneeded semicolon Eliminate the following coccinelle check warning: fs/dlm/midcomms.c:972:2-3 Reported-by: Zeal Robot Signed-off-by: Zhang Mingyu Signed-off-by: David Teigland --- fs/dlm/midcomms.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index 0b9bce6f04e14..74b4308b912cf 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -969,7 +969,7 @@ void dlm_midcomms_receive_done(int nodeid) spin_unlock(&node->state_lock); /* do nothing FIN has it's own ack send */ break; - }; + } srcu_read_unlock(&nodes_srcu, idx); } From 6a628fa43810f861da50c593c69f2ead1c829231 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Fri, 12 Nov 2021 10:08:01 -0500 Subject: [PATCH 21/32] fs: dlm: fix potential buffer overflow This patch fixes a potential overflow in sscanf: the maximum declared string parsing length seems to exclude the null termination symbol. This patch just adds one byte to the buffer so that it can hold a string of length DLM_RESNAME_MAXLEN plus the null termination symbol. 
Fixes: 5054e79de999 ("fs: dlm: add lkb debugfs functionality") Reported-by: kernel test robot Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/debug_fs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c index df6f3f107be4f..8fb04ebbafb5d 100644 --- a/fs/dlm/debug_fs.c +++ b/fs/dlm/debug_fs.c @@ -640,7 +640,7 @@ static ssize_t table_write2(struct file *file, const char __user *user_buf, { struct seq_file *seq = file->private_data; int n, len, lkb_nodeid, lkb_status, error; - char name[DLM_RESNAME_MAXLEN] = {}; + char name[DLM_RESNAME_MAXLEN + 1] = {}; struct dlm_ls *ls = seq->private; unsigned int lkb_flags; char buf[256] = {}; From 4c3d90570bcc2b338f70f61f01110268e281ca3c Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Mon, 15 Nov 2021 08:57:05 -0500 Subject: [PATCH 22/32] fs: dlm: don't call kernel_getpeername() in error_report() In some cases kernel_getpeername() will take the socket lock, which is already held when the socket layer calls the error_report() callback. Since commit 9dfc685e0262 ("inet: remove races in inet{6}_getname()") this problem becomes more likely because the socket lock will always be taken. You will see something like: bob9-u5 login: [ 562.316860] BUG: spinlock recursion on CPU#7, swapper/7/0 [ 562.318562] lock: 0xffff8f2284720088, .magic: dead4ead, .owner: swapper/7/0, .owner_cpu: 7 [ 562.319522] CPU: 7 PID: 0 Comm: swapper/7 Not tainted 5.15.0+ #135 [ 562.320346] Hardware name: Red Hat KVM/RHEL-AV, BIOS 1.13.0-2.module+el8.3.0+7353+9de0a3cc 04/01/2014 [ 562.321277] Call Trace: [ 562.321529] [ 562.321734] dump_stack_lvl+0x33/0x42 [ 562.322282] do_raw_spin_lock+0x8b/0xc0 [ 562.322674] lock_sock_nested+0x1e/0x50 [ 562.323057] inet_getname+0x39/0x110 [ 562.323425] ? sock_def_readable+0x80/0x80 [ 562.323838] lowcomms_error_report+0x63/0x260 [dlm] [ 562.324338] ? wait_for_completion_interruptible_timeout+0xd2/0x120 [ 562.324949] ? 
lock_timer_base+0x67/0x80 [ 562.325330] ? do_raw_spin_unlock+0x49/0xc0 [ 562.325735] ? _raw_spin_unlock_irqrestore+0x1e/0x40 [ 562.326218] ? del_timer+0x54/0x80 [ 562.326549] sk_error_report+0x12/0x70 [ 562.326919] tcp_validate_incoming+0x3c8/0x530 [ 562.327347] ? kvm_clock_read+0x14/0x30 [ 562.327718] ? ktime_get+0x3b/0xa0 [ 562.328055] tcp_rcv_established+0x121/0x660 [ 562.328466] tcp_v4_do_rcv+0x132/0x260 [ 562.328835] tcp_v4_rcv+0xcea/0xe20 [ 562.329173] ip_protocol_deliver_rcu+0x35/0x1f0 [ 562.329615] ip_local_deliver_finish+0x54/0x60 [ 562.330050] ip_local_deliver+0xf7/0x110 [ 562.330431] ? inet_rtm_getroute+0x211/0x840 [ 562.330848] ? ip_protocol_deliver_rcu+0x1f0/0x1f0 [ 562.331310] ip_rcv+0xe1/0xf0 [ 562.331603] ? ip_local_deliver+0x110/0x110 [ 562.332011] __netif_receive_skb_core+0x46a/0x1040 [ 562.332476] ? inet_gro_receive+0x263/0x2e0 [ 562.332885] __netif_receive_skb_list_core+0x13b/0x2c0 [ 562.333383] netif_receive_skb_list_internal+0x1c8/0x2f0 [ 562.333896] ? update_load_avg+0x7e/0x5e0 [ 562.334285] gro_normal_list.part.149+0x19/0x40 [ 562.334722] napi_complete_done+0x67/0x160 [ 562.335134] virtnet_poll+0x2ad/0x408 [virtio_net] [ 562.335644] __napi_poll+0x28/0x140 [ 562.336012] net_rx_action+0x23d/0x300 [ 562.336414] __do_softirq+0xf2/0x2ea [ 562.336803] irq_exit_rcu+0xc1/0xf0 [ 562.337173] common_interrupt+0xb9/0xd0 It is and was always forbidden to call kernel_getpeername() in context of error_report(). To get rid of the problem we access the destination address for the peer over the socket structure. While on it we fix to print out the destination port of the inet socket. 
Fixes: 1a31833d085a ("DLM: Replace nodeid_to_addr with kernel_getpeername") Reported-by: Bob Peterson Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 42 ++++++++++++++++++++---------------------- 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 2f070514b3eed..c7750849c4954 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -594,8 +594,8 @@ int dlm_lowcomms_nodes_set_mark(int nodeid, unsigned int mark) static void lowcomms_error_report(struct sock *sk) { struct connection *con; - struct sockaddr_storage saddr; void (*orig_report)(struct sock *) = NULL; + struct inet_sock *inet; read_lock_bh(&sk->sk_callback_lock); con = sock2con(sk); @@ -603,33 +603,31 @@ static void lowcomms_error_report(struct sock *sk) goto out; orig_report = listen_sock.sk_error_report; - if (kernel_getpeername(sk->sk_socket, (struct sockaddr *)&saddr) < 0) { - printk_ratelimited(KERN_ERR "dlm: node %d: socket error " - "sending to node %d, port %d, " - "sk_err=%d/%d\n", dlm_our_nodeid(), - con->nodeid, dlm_config.ci_tcp_port, - sk->sk_err, sk->sk_err_soft); - } else if (saddr.ss_family == AF_INET) { - struct sockaddr_in *sin4 = (struct sockaddr_in *)&saddr; + inet = inet_sk(sk); + switch (sk->sk_family) { + case AF_INET: printk_ratelimited(KERN_ERR "dlm: node %d: socket error " - "sending to node %d at %pI4, port %d, " + "sending to node %d at %pI4, dport %d, " "sk_err=%d/%d\n", dlm_our_nodeid(), - con->nodeid, &sin4->sin_addr.s_addr, - dlm_config.ci_tcp_port, sk->sk_err, + con->nodeid, &inet->inet_daddr, + ntohs(inet->inet_dport), sk->sk_err, sk->sk_err_soft); - } else { - struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&saddr; - + break; + case AF_INET6: printk_ratelimited(KERN_ERR "dlm: node %d: socket error " - "sending to node %d at %u.%u.%u.%u, " - "port %d, sk_err=%d/%d\n", dlm_our_nodeid(), - con->nodeid, sin6->sin6_addr.s6_addr32[0], - sin6->sin6_addr.s6_addr32[1], - 
sin6->sin6_addr.s6_addr32[2], - sin6->sin6_addr.s6_addr32[3], - dlm_config.ci_tcp_port, sk->sk_err, + "sending to node %d at %pI6c, " + "dport %d, sk_err=%d/%d\n", dlm_our_nodeid(), + con->nodeid, &sk->sk_v6_daddr, + ntohs(inet->inet_dport), sk->sk_err, sk->sk_err_soft); + break; + default: + printk_ratelimited(KERN_ERR "dlm: node %d: socket error " + "invalid socket family %d set, " + "sk_err=%d/%d\n", dlm_our_nodeid(), + sk->sk_family, sk->sk_err, sk->sk_err_soft); + goto out; } /* below sendcon only handling */ From 92c44605381418b01af44c63fd27185cac368866 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Mon, 15 Nov 2021 08:57:06 -0500 Subject: [PATCH 23/32] fs: dlm: replace use of socket sk_callback_lock with sock_lock This patch will replace the use of socket sk_callback_lock lock and uses socket lock instead. Some users like sunrpc, see commit ea9afca88bbe ("SUNRPC: Replace use of socket sk_callback_lock with sock_lock") moving from sk_callback_lock to sock_lock which seems to be held when the socket callbacks are called. 
Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 27 ++++++++++----------------- 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index c7750849c4954..2034701890111 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -488,11 +488,9 @@ static void lowcomms_data_ready(struct sock *sk) { struct connection *con; - read_lock_bh(&sk->sk_callback_lock); con = sock2con(sk); if (con && !test_and_set_bit(CF_READ_PENDING, &con->flags)) queue_work(recv_workqueue, &con->rwork); - read_unlock_bh(&sk->sk_callback_lock); } static void lowcomms_listen_data_ready(struct sock *sk) @@ -507,15 +505,14 @@ static void lowcomms_write_space(struct sock *sk) { struct connection *con; - read_lock_bh(&sk->sk_callback_lock); con = sock2con(sk); if (!con) - goto out; + return; if (!test_and_set_bit(CF_CONNECTED, &con->flags)) { log_print("successful connected to node %d", con->nodeid); queue_work(send_workqueue, &con->swork); - goto out; + return; } clear_bit(SOCK_NOSPACE, &con->sock->flags); @@ -526,8 +523,6 @@ static void lowcomms_write_space(struct sock *sk) } queue_work(send_workqueue, &con->swork); -out: - read_unlock_bh(&sk->sk_callback_lock); } static inline void lowcomms_connect_sock(struct connection *con) @@ -597,7 +592,6 @@ static void lowcomms_error_report(struct sock *sk) void (*orig_report)(struct sock *) = NULL; struct inet_sock *inet; - read_lock_bh(&sk->sk_callback_lock); con = sock2con(sk); if (con == NULL) goto out; @@ -646,7 +640,6 @@ static void lowcomms_error_report(struct sock *sk) queue_work(send_workqueue, &con->swork); out: - read_unlock_bh(&sk->sk_callback_lock); if (orig_report) orig_report(sk); } @@ -666,20 +659,20 @@ static void restore_callbacks(struct socket *sock) { struct sock *sk = sock->sk; - write_lock_bh(&sk->sk_callback_lock); + lock_sock(sk); sk->sk_user_data = NULL; sk->sk_data_ready = listen_sock.sk_data_ready; sk->sk_state_change = listen_sock.sk_state_change; 
sk->sk_write_space = listen_sock.sk_write_space; sk->sk_error_report = listen_sock.sk_error_report; - write_unlock_bh(&sk->sk_callback_lock); + release_sock(sk); } static void add_listen_sock(struct socket *sock, struct listen_connection *con) { struct sock *sk = sock->sk; - write_lock_bh(&sk->sk_callback_lock); + lock_sock(sk); save_listen_callbacks(sock); con->sock = sock; @@ -687,7 +680,7 @@ static void add_listen_sock(struct socket *sock, struct listen_connection *con) sk->sk_allocation = GFP_NOFS; /* Install a data_ready callback */ sk->sk_data_ready = lowcomms_listen_data_ready; - write_unlock_bh(&sk->sk_callback_lock); + release_sock(sk); } /* Make a socket active */ @@ -695,7 +688,7 @@ static void add_sock(struct socket *sock, struct connection *con) { struct sock *sk = sock->sk; - write_lock_bh(&sk->sk_callback_lock); + lock_sock(sk); con->sock = sock; sk->sk_user_data = con; @@ -705,7 +698,7 @@ static void add_sock(struct socket *sock, struct connection *con) sk->sk_state_change = lowcomms_state_change; sk->sk_allocation = GFP_NOFS; sk->sk_error_report = lowcomms_error_report; - write_unlock_bh(&sk->sk_callback_lock); + release_sock(sk); } /* Add the port number to an IPv6 or 4 sockaddr and return the address @@ -1680,9 +1673,9 @@ static void _stop_conn(struct connection *con, bool and_other) set_bit(CF_READ_PENDING, &con->flags); set_bit(CF_WRITE_PENDING, &con->flags); if (con->sock && con->sock->sk) { - write_lock_bh(&con->sock->sk->sk_callback_lock); + lock_sock(con->sock->sk); con->sock->sk->sk_user_data = NULL; - write_unlock_bh(&con->sock->sk->sk_callback_lock); + release_sock(con->sock->sk); } if (con->othercon && and_other) _stop_conn(con->othercon, false); From 1b9beda83e27a0c2cd75d1cb743c297c7b36c844 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 17 Nov 2021 09:20:43 -0500 Subject: [PATCH 24/32] fs: dlm: fix build with CONFIG_IPV6 disabled This patch will surround the AF_INET6 case in sk_error_report() of dlm with a #if 
IS_ENABLED(CONFIG_IPV6). The field sk->sk_v6_daddr is not defined when CONFIG_IPV6 is disabled. If CONFIG_IPV6 is disabled, the socket creation with AF_INET6 should already fail because of a runtime check whether AF_INET6 is registered. However, if AF_INET6 is nevertheless set as sk_family, the sk_error_report() callback will then print an invalid family type error. Reported-by: kernel test robot Fixes: 4c3d90570bcc ("fs: dlm: don't call kernel_getpeername() in error_report()") Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 2034701890111..f7fc1ac76ce83 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -608,6 +608,7 @@ static void lowcomms_error_report(struct sock *sk) ntohs(inet->inet_dport), sk->sk_err, sk->sk_err_soft); break; +#if IS_ENABLED(CONFIG_IPV6) case AF_INET6: printk_ratelimited(KERN_ERR "dlm: node %d: socket error " "sending to node %d at %pI6c, " @@ -616,6 +617,7 @@ static void lowcomms_error_report(struct sock *sk) ntohs(inet->inet_dport), sk->sk_err, sk->sk_err_soft); break; +#endif default: printk_ratelimited(KERN_ERR "dlm: node %d: socket error " "invalid socket family %d set, " From f70813d6a5fce7bde411272cfe1ab565a4254266 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Tue, 30 Nov 2021 14:47:14 -0500 Subject: [PATCH 25/32] fs: dlm: use list_empty() to check last iteration This patch will use list_empty(&ls->ls_cb_delay) to check for the last list iteration. If the number of entries is a multiple of MAX_CB_QUEUE, the list is already empty after the last batch and we do an extra goto more, which we can avoid by checking list_empty(). 
Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/ast.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c index 27bae7d4a477a..bfac462dd3e8f 100644 --- a/fs/dlm/ast.c +++ b/fs/dlm/ast.c @@ -300,6 +300,7 @@ void dlm_callback_resume(struct dlm_ls *ls) { struct dlm_lkb *lkb, *safe; int count = 0, sum = 0; + bool empty; clear_bit(LSFL_CB_DELAY, &ls->ls_flags); @@ -315,10 +316,11 @@ void dlm_callback_resume(struct dlm_ls *ls) if (count == MAX_CB_QUEUE) break; } + empty = list_empty(&ls->ls_cb_delay); mutex_unlock(&ls->ls_cb_mutex); sum += count; - if (count == MAX_CB_QUEUE) { + if (!empty) { count = 0; cond_resched(); goto more; From bcbfea41e1f9d516faed1faf0f2d390c000bf0d9 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Tue, 30 Nov 2021 14:47:15 -0500 Subject: [PATCH 26/32] fs: dlm: check for pending users filling buffers Currently we don't care if the DLM application stack is filling buffers (not committed yet) while we transmit some already committed buffers. By checking on active writequeue users before dequeue a writequeue entry we know there is coming more data and do nothing. We wait until the send worker will be triggered again if the writequeue entry users hit zero. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index f7fc1ac76ce83..6d500ebc61453 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -201,7 +201,10 @@ static struct writequeue_entry *con_next_wq(struct connection *con) e = list_first_entry(&con->writequeue, struct writequeue_entry, list); - if (e->len == 0) + /* if len is zero nothing is to send, if there are users filling + * buffers we wait until the users are done so we can send more. 
+ */ + if (e->users || e->len == 0) return NULL; return e; From 21d9ac1a5376d949199398848006f6b14649f533 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Tue, 30 Nov 2021 14:47:16 -0500 Subject: [PATCH 27/32] fs: dlm: use event based wait for pending remove This patch will use an event based waitqueue to wait for a possible clash with the ls_remove_name field of dlm_ls instead of doing busy waiting. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/dlm_internal.h | 1 + fs/dlm/lock.c | 19 ++++++++++++------- fs/dlm/lockspace.c | 1 + 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 019931804af9b..74a9590a4dd5b 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -576,6 +576,7 @@ struct dlm_ls { struct list_head ls_new_rsb; /* new rsb structs */ spinlock_t ls_remove_spin; + wait_queue_head_t ls_remove_wait; char ls_remove_name[DLM_RESNAME_MAXLEN+1]; char *ls_remove_names[DLM_REMOVE_NAMES_MAX]; int ls_remove_len; diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 54705d367076b..bdb51d209ba25 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -1626,21 +1626,24 @@ static int remove_from_waiters_ms(struct dlm_lkb *lkb, struct dlm_message *ms) } /* If there's an rsb for the same resource being removed, ensure - that the remove message is sent before the new lookup message. - It should be rare to need a delay here, but if not, then it may - be worthwhile to add a proper wait mechanism rather than a delay. */ + * that the remove message is sent before the new lookup message. 
+ */ + +#define DLM_WAIT_PENDING_COND(ls, r) \ + (ls->ls_remove_len && \ + !rsb_cmp(r, ls->ls_remove_name, \ + ls->ls_remove_len)) static void wait_pending_remove(struct dlm_rsb *r) { struct dlm_ls *ls = r->res_ls; restart: spin_lock(&ls->ls_remove_spin); - if (ls->ls_remove_len && - !rsb_cmp(r, ls->ls_remove_name, ls->ls_remove_len)) { + if (DLM_WAIT_PENDING_COND(ls, r)) { log_debug(ls, "delay lookup for remove dir %d %s", - r->res_dir_nodeid, r->res_name); + r->res_dir_nodeid, r->res_name); spin_unlock(&ls->ls_remove_spin); - msleep(1); + wait_event(ls->ls_remove_wait, !DLM_WAIT_PENDING_COND(ls, r)); goto restart; } spin_unlock(&ls->ls_remove_spin); @@ -1792,6 +1795,7 @@ static void shrink_bucket(struct dlm_ls *ls, int b) memcpy(ls->ls_remove_name, name, DLM_RESNAME_MAXLEN); spin_unlock(&ls->ls_remove_spin); spin_unlock(&ls->ls_rsbtbl[b].lock); + wake_up(&ls->ls_remove_wait); send_remove(r); @@ -4075,6 +4079,7 @@ static void send_repeat_remove(struct dlm_ls *ls, char *ms_name, int len) memcpy(ls->ls_remove_name, name, DLM_RESNAME_MAXLEN); spin_unlock(&ls->ls_remove_spin); spin_unlock(&ls->ls_rsbtbl[b].lock); + wake_up(&ls->ls_remove_wait); rv = _create_message(ls, sizeof(struct dlm_message) + len, dir_nodeid, DLM_MSG_REMOVE, &ms, &mh); diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 2e51bd2bdacce..31384e7d6f90a 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -512,6 +512,7 @@ static int new_lockspace(const char *name, const char *cluster, } spin_lock_init(&ls->ls_remove_spin); + init_waitqueue_head(&ls->ls_remove_wait); for (i = 0; i < DLM_REMOVE_NAMES_MAX; i++) { ls->ls_remove_names[i] = kzalloc(DLM_RESNAME_MAXLEN+1, From be3b0400edbf68556cd390125e2c868988616391 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Tue, 30 Nov 2021 14:47:17 -0500 Subject: [PATCH 28/32] fs: dlm: remove wq_alloc mutex This patch cleans up the code for allocating a new buffer in the dlm writequeue mechanism.
There was a possible tuneup to allow scheduling while a new writequeue entry needs to be allocated because either no sending page is available or the existing ones are full. To avoid multiple concurrent users checking at the same time whether an entry is available or full, the wq_alloc mutex was introduced so that those users wait while a new writequeue entry is in the process of being queued, so that further users will check whether the newly allocated writequeue entry is full. To simplify the code we just remove this mutex and instead hold the already introduced spin lock during the writequeue check, allocation and queueing. So other users can never check for available writequeues while there is a new one in process but not queued yet. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 48 +++++++++++------------------------------------ 1 file changed, 11 insertions(+), 37 deletions(-) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 6d500ebc61453..4919faf797097 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -86,7 +86,6 @@ struct connection { struct list_head writequeue; /* List of outgoing writequeue_entries */ spinlock_t writequeue_lock; atomic_t writequeue_cnt; - struct mutex wq_alloc; int retries; #define MAX_CONNECT_RETRIES 3 struct hlist_node list; @@ -270,8 +269,6 @@ static struct connection *nodeid2con(int nodeid, gfp_t alloc) return NULL; } - mutex_init(&con->wq_alloc); - spin_lock(&connections_lock); /* Because multiple workqueues/threads calls this function it can * race on multiple cpu's.
Instead of locking hot path __find_con() @@ -1176,16 +1173,15 @@ static void deinit_local(void) kfree(dlm_local_addr[i]); } -static struct writequeue_entry *new_writequeue_entry(struct connection *con, - gfp_t allocation) +static struct writequeue_entry *new_writequeue_entry(struct connection *con) { struct writequeue_entry *entry; - entry = kzalloc(sizeof(*entry), allocation); + entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) return NULL; - entry->page = alloc_page(allocation | __GFP_ZERO); + entry->page = alloc_page(GFP_ATOMIC | __GFP_ZERO); if (!entry->page) { kfree(entry); return NULL; @@ -1200,8 +1196,8 @@ static struct writequeue_entry *new_writequeue_entry(struct connection *con, } static struct writequeue_entry *new_wq_entry(struct connection *con, int len, - gfp_t allocation, char **ppc, - void (*cb)(void *data), void *data) + char **ppc, void (*cb)(void *data), + void *data) { struct writequeue_entry *e; @@ -1217,29 +1213,25 @@ static struct writequeue_entry *new_wq_entry(struct connection *con, int len, e->end += len; e->users++; - spin_unlock(&con->writequeue_lock); - - return e; + goto out; } } - spin_unlock(&con->writequeue_lock); - e = new_writequeue_entry(con, allocation); + e = new_writequeue_entry(con); if (!e) - return NULL; + goto out; kref_get(&e->ref); *ppc = page_address(e->page); e->end += len; atomic_inc(&con->writequeue_cnt); - - spin_lock(&con->writequeue_lock); if (cb) cb(data); list_add_tail(&e->list, &con->writequeue); - spin_unlock(&con->writequeue_lock); +out: + spin_unlock(&con->writequeue_lock); return e; }; @@ -1250,37 +1242,19 @@ static struct dlm_msg *dlm_lowcomms_new_msg_con(struct connection *con, int len, { struct writequeue_entry *e; struct dlm_msg *msg; - bool sleepable; msg = kzalloc(sizeof(*msg), allocation); if (!msg) return NULL; - /* this mutex is being used as a wait to avoid multiple "fast" - * new writequeue page list entry allocs in new_wq_entry in - * normal operation which is sleepable context. 
Without it - * we could end in multiple writequeue entries with one - * dlm message because multiple callers were waiting at - * the writequeue_lock in new_wq_entry(). - */ - sleepable = gfpflags_normal_context(allocation); - if (sleepable) - mutex_lock(&con->wq_alloc); - kref_init(&msg->ref); - e = new_wq_entry(con, len, allocation, ppc, cb, data); + e = new_wq_entry(con, len, ppc, cb, data); if (!e) { - if (sleepable) - mutex_unlock(&con->wq_alloc); - kfree(msg); return NULL; } - if (sleepable) - mutex_unlock(&con->wq_alloc); - msg->ppc = *ppc; msg->len = len; msg->entry = e; From 6c547f264077ffeb56390f42ed2a07749dd619b2 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Tue, 30 Nov 2021 14:47:18 -0500 Subject: [PATCH 29/32] fs: dlm: memory cache for midcomms hotpath This patch will introduce a kmem cache for allocating message handles which are needed for midcomms layer to take track of lowcomms messages. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/memory.c | 31 ++++++++++++++++++++++++++----- fs/dlm/memory.h | 2 ++ fs/dlm/midcomms.c | 21 +++++++++++++++------ fs/dlm/midcomms.h | 1 + 4 files changed, 44 insertions(+), 11 deletions(-) diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c index 5918f4d395869..8996c6453ad5c 100644 --- a/fs/dlm/memory.c +++ b/fs/dlm/memory.c @@ -10,32 +10,44 @@ ******************************************************************************/ #include "dlm_internal.h" +#include "midcomms.h" #include "config.h" #include "memory.h" +static struct kmem_cache *mhandle_cache; static struct kmem_cache *lkb_cache; static struct kmem_cache *rsb_cache; int __init dlm_memory_init(void) { + mhandle_cache = dlm_midcomms_cache_create(); + if (!mhandle_cache) + goto out; + lkb_cache = kmem_cache_create("dlm_lkb", sizeof(struct dlm_lkb), __alignof__(struct dlm_lkb), 0, NULL); if (!lkb_cache) - return -ENOMEM; + goto lkb; rsb_cache = kmem_cache_create("dlm_rsb", sizeof(struct dlm_rsb), __alignof__(struct dlm_rsb), 0, NULL); - if 
(!rsb_cache) { - kmem_cache_destroy(lkb_cache); - return -ENOMEM; - } + if (!rsb_cache) + goto rsb; return 0; + +rsb: + kmem_cache_destroy(lkb_cache); +lkb: + kmem_cache_destroy(mhandle_cache); +out: + return -ENOMEM; } void dlm_memory_exit(void) { + kmem_cache_destroy(mhandle_cache); kmem_cache_destroy(lkb_cache); kmem_cache_destroy(rsb_cache); } @@ -89,3 +101,12 @@ void dlm_free_lkb(struct dlm_lkb *lkb) kmem_cache_free(lkb_cache, lkb); } +struct dlm_mhandle *dlm_allocate_mhandle(void) +{ + return kmem_cache_alloc(mhandle_cache, GFP_NOFS); +} + +void dlm_free_mhandle(struct dlm_mhandle *mhandle) +{ + kmem_cache_free(mhandle_cache, mhandle); +} diff --git a/fs/dlm/memory.h b/fs/dlm/memory.h index 4f218ea4b187d..c4d46be778a29 100644 --- a/fs/dlm/memory.h +++ b/fs/dlm/memory.h @@ -20,6 +20,8 @@ struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls); void dlm_free_lkb(struct dlm_lkb *l); char *dlm_allocate_lvb(struct dlm_ls *ls); void dlm_free_lvb(char *l); +struct dlm_mhandle *dlm_allocate_mhandle(void); +void dlm_free_mhandle(struct dlm_mhandle *mhandle); #endif /* __MEMORY_DOT_H__ */ diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index 74b4308b912cf..3635e42b06696 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -137,6 +137,7 @@ #include "dlm_internal.h" #include "lowcomms.h" #include "config.h" +#include "memory.h" #include "lock.h" #include "util.h" #include "midcomms.h" @@ -220,6 +221,12 @@ DEFINE_STATIC_SRCU(nodes_srcu); */ static DEFINE_MUTEX(close_lock); +struct kmem_cache *dlm_midcomms_cache_create(void) +{ + return kmem_cache_create("dlm_mhandle", sizeof(struct dlm_mhandle), + 0, 0, NULL); +} + static inline const char *dlm_state_str(int state) { switch (state) { @@ -279,7 +286,7 @@ static void dlm_mhandle_release(struct rcu_head *rcu) struct dlm_mhandle *mh = container_of(rcu, struct dlm_mhandle, rcu); dlm_lowcomms_put_msg(mh->msg); - kfree(mh); + dlm_free_mhandle(mh); } static void dlm_mhandle_delete(struct midcomms_node *node, @@ -1073,10 
+1080,12 @@ struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len, /* this is a bug, however we going on and hope it will be resolved */ WARN_ON(test_bit(DLM_NODE_FLAG_STOP_TX, &node->flags)); - mh = kzalloc(sizeof(*mh), GFP_NOFS); + mh = dlm_allocate_mhandle(); if (!mh) goto err; + mh->committed = false; + mh->ack_rcv = NULL; mh->idx = idx; mh->node = node; @@ -1085,7 +1094,7 @@ struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len, msg = dlm_lowcomms_new_msg(nodeid, len, allocation, ppc, NULL, NULL); if (!msg) { - kfree(mh); + dlm_free_mhandle(mh); goto err; } @@ -1094,13 +1103,13 @@ struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len, msg = dlm_midcomms_get_msg_3_2(mh, nodeid, len, allocation, ppc); if (!msg) { - kfree(mh); + dlm_free_mhandle(mh); goto err; } break; default: - kfree(mh); + dlm_free_mhandle(mh); WARN_ON(1); goto err; } @@ -1136,7 +1145,7 @@ void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh) dlm_lowcomms_commit_msg(mh->msg); dlm_lowcomms_put_msg(mh->msg); /* mh is not part of rcu list in this case */ - kfree(mh); + dlm_free_mhandle(mh); break; case DLM_VERSION_3_2: dlm_midcomms_commit_msg_3_2(mh); diff --git a/fs/dlm/midcomms.h b/fs/dlm/midcomms.h index bc63cf73aa872..82bcd96619228 100644 --- a/fs/dlm/midcomms.h +++ b/fs/dlm/midcomms.h @@ -30,6 +30,7 @@ int dlm_midcomms_send_queue_cnt(struct midcomms_node *node); uint32_t dlm_midcomms_version(struct midcomms_node *node); int dlm_midcomms_rawmsg_send(struct midcomms_node *node, void *buf, int buflen); +struct kmem_cache *dlm_midcomms_cache_create(void); #endif /* __MIDCOMMS_DOT_H__ */ From 3af2326ca0a13cf84aeb75e001e757ff3cefeae9 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Tue, 30 Nov 2021 14:47:19 -0500 Subject: [PATCH 30/32] fs: dlm: memory cache for writequeue_entry This patch introduces a kmem cache for writequeue entries. Writequeue entries get allocated quite a lot when dlm transmits messages.
Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 26 +++++++++++++++++++++----- fs/dlm/lowcomms.h | 1 + fs/dlm/memory.c | 21 ++++++++++++++++++++- fs/dlm/memory.h | 2 ++ 4 files changed, 44 insertions(+), 6 deletions(-) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 4919faf797097..300f44c5d1326 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -58,6 +58,7 @@ #include "dlm_internal.h" #include "lowcomms.h" #include "midcomms.h" +#include "memory.h" #include "config.h" #define NEEDED_RMEM (4*1024*1024) @@ -190,6 +191,19 @@ static const struct dlm_proto_ops *dlm_proto_ops; static void process_recv_sockets(struct work_struct *work); static void process_send_sockets(struct work_struct *work); +static void writequeue_entry_ctor(void *data) +{ + struct writequeue_entry *entry = data; + + INIT_LIST_HEAD(&entry->msgs); +} + +struct kmem_cache *dlm_lowcomms_writequeue_cache_create(void) +{ + return kmem_cache_create("dlm_writequeue", sizeof(struct writequeue_entry), + 0, 0, writequeue_entry_ctor); +} + /* need to held writequeue_lock */ static struct writequeue_entry *con_next_wq(struct connection *con) { @@ -728,7 +742,7 @@ static void dlm_page_release(struct kref *kref) ref); __free_page(e->page); - kfree(e); + dlm_free_writequeue(e); } static void dlm_msg_release(struct kref *kref) @@ -1177,21 +1191,23 @@ static struct writequeue_entry *new_writequeue_entry(struct connection *con) { struct writequeue_entry *entry; - entry = kzalloc(sizeof(*entry), GFP_ATOMIC); + entry = dlm_allocate_writequeue(); if (!entry) return NULL; entry->page = alloc_page(GFP_ATOMIC | __GFP_ZERO); if (!entry->page) { - kfree(entry); + dlm_free_writequeue(entry); return NULL; } + entry->offset = 0; + entry->len = 0; + entry->end = 0; + entry->dirty = false; entry->con = con; entry->users = 1; kref_init(&entry->ref); - INIT_LIST_HEAD(&entry->msgs); - return entry; } diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h index 
8108ea24ec301..6c8f4ce457f05 100644 --- a/fs/dlm/lowcomms.h +++ b/fs/dlm/lowcomms.h @@ -47,6 +47,7 @@ int dlm_lowcomms_connect_node(int nodeid); int dlm_lowcomms_nodes_set_mark(int nodeid, unsigned int mark); int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len); void dlm_midcomms_receive_done(int nodeid); +struct kmem_cache *dlm_lowcomms_writequeue_cache_create(void); #endif /* __LOWCOMMS_DOT_H__ */ diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c index 8996c6453ad5c..94af986e83c6d 100644 --- a/fs/dlm/memory.c +++ b/fs/dlm/memory.c @@ -11,9 +11,11 @@ #include "dlm_internal.h" #include "midcomms.h" +#include "lowcomms.h" #include "config.h" #include "memory.h" +static struct kmem_cache *writequeue_cache; static struct kmem_cache *mhandle_cache; static struct kmem_cache *lkb_cache; static struct kmem_cache *rsb_cache; @@ -21,9 +23,13 @@ static struct kmem_cache *rsb_cache; int __init dlm_memory_init(void) { + writequeue_cache = dlm_lowcomms_writequeue_cache_create(); + if (!writequeue_cache) + goto out; + mhandle_cache = dlm_midcomms_cache_create(); if (!mhandle_cache) - goto out; + goto mhandle; lkb_cache = kmem_cache_create("dlm_lkb", sizeof(struct dlm_lkb), __alignof__(struct dlm_lkb), 0, NULL); @@ -41,12 +47,15 @@ int __init dlm_memory_init(void) kmem_cache_destroy(lkb_cache); lkb: kmem_cache_destroy(mhandle_cache); +mhandle: + kmem_cache_destroy(writequeue_cache); out: return -ENOMEM; } void dlm_memory_exit(void) { + kmem_cache_destroy(writequeue_cache); kmem_cache_destroy(mhandle_cache); kmem_cache_destroy(lkb_cache); kmem_cache_destroy(rsb_cache); @@ -110,3 +119,13 @@ void dlm_free_mhandle(struct dlm_mhandle *mhandle) { kmem_cache_free(mhandle_cache, mhandle); } + +struct writequeue_entry *dlm_allocate_writequeue(void) +{ + return kmem_cache_alloc(writequeue_cache, GFP_ATOMIC); +} + +void dlm_free_writequeue(struct writequeue_entry *writequeue) +{ + kmem_cache_free(writequeue_cache, writequeue); +} diff --git a/fs/dlm/memory.h 
b/fs/dlm/memory.h index c4d46be778a29..854269eacd445 100644 --- a/fs/dlm/memory.h +++ b/fs/dlm/memory.h @@ -22,6 +22,8 @@ char *dlm_allocate_lvb(struct dlm_ls *ls); void dlm_free_lvb(char *l); struct dlm_mhandle *dlm_allocate_mhandle(void); void dlm_free_mhandle(struct dlm_mhandle *mhandle); +struct writequeue_entry *dlm_allocate_writequeue(void); +void dlm_free_writequeue(struct writequeue_entry *writequeue); #endif /* __MEMORY_DOT_H__ */ From e4dc81ed5a8069b8ae56116058ebbad77ff559ec Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Tue, 30 Nov 2021 14:47:20 -0500 Subject: [PATCH 31/32] fs: dlm: memory cache for lowcomms hotpath This patch introduces a kmem cache for dlm_msg handles, which are always used when dlm sends a message out, whether or not they are covered by the midcomms layer. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 13 ++++++++++--- fs/dlm/lowcomms.h | 1 + fs/dlm/memory.c | 18 ++++++++++++++++++ fs/dlm/memory.h | 2 ++ 4 files changed, 31 insertions(+), 3 deletions(-) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 300f44c5d1326..23a1ff6907252 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -204,6 +204,11 @@ struct kmem_cache *dlm_lowcomms_writequeue_cache_create(void) 0, 0, writequeue_entry_ctor); } +struct kmem_cache *dlm_lowcomms_msg_cache_create(void) +{ + return kmem_cache_create("dlm_msg", sizeof(struct dlm_msg), 0, 0, NULL); +} + /* need to held writequeue_lock */ static struct writequeue_entry *con_next_wq(struct connection *con) { @@ -750,7 +755,7 @@ static void dlm_msg_release(struct kref *kref) struct dlm_msg *msg = container_of(kref, struct dlm_msg, ref); kref_put(&msg->entry->ref, dlm_page_release); - kfree(msg); + dlm_free_msg(msg); } static void free_entry(struct writequeue_entry *e) @@ -1259,7 +1264,7 @@ static struct dlm_msg *dlm_lowcomms_new_msg_con(struct connection *con, int len, struct writequeue_entry *e; struct dlm_msg *msg; - msg = kzalloc(sizeof(*msg), allocation); +
msg = dlm_allocate_msg(allocation); if (!msg) return NULL; @@ -1267,10 +1272,12 @@ static struct dlm_msg *dlm_lowcomms_new_msg_con(struct connection *con, int len, e = new_wq_entry(con, len, ppc, cb, data); if (!e) { - kfree(msg); + dlm_free_msg(msg); return NULL; } + msg->retransmit = false; + msg->orig_msg = NULL; msg->ppc = *ppc; msg->len = len; msg->entry = e; diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h index 6c8f4ce457f05..29369feea9916 100644 --- a/fs/dlm/lowcomms.h +++ b/fs/dlm/lowcomms.h @@ -48,6 +48,7 @@ int dlm_lowcomms_nodes_set_mark(int nodeid, unsigned int mark); int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len); void dlm_midcomms_receive_done(int nodeid); struct kmem_cache *dlm_lowcomms_writequeue_cache_create(void); +struct kmem_cache *dlm_lowcomms_msg_cache_create(void); #endif /* __LOWCOMMS_DOT_H__ */ diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c index 94af986e83c6d..ce35c3c19aeb5 100644 --- a/fs/dlm/memory.c +++ b/fs/dlm/memory.c @@ -17,6 +17,7 @@ static struct kmem_cache *writequeue_cache; static struct kmem_cache *mhandle_cache; +static struct kmem_cache *msg_cache; static struct kmem_cache *lkb_cache; static struct kmem_cache *rsb_cache; @@ -36,6 +37,10 @@ int __init dlm_memory_init(void) if (!lkb_cache) goto lkb; + msg_cache = dlm_lowcomms_msg_cache_create(); + if (!msg_cache) + goto msg; + rsb_cache = kmem_cache_create("dlm_rsb", sizeof(struct dlm_rsb), __alignof__(struct dlm_rsb), 0, NULL); if (!rsb_cache) @@ -44,6 +49,8 @@ int __init dlm_memory_init(void) return 0; rsb: + kmem_cache_destroy(msg_cache); +msg: kmem_cache_destroy(lkb_cache); lkb: kmem_cache_destroy(mhandle_cache); @@ -57,6 +64,7 @@ void dlm_memory_exit(void) { kmem_cache_destroy(writequeue_cache); kmem_cache_destroy(mhandle_cache); + kmem_cache_destroy(msg_cache); kmem_cache_destroy(lkb_cache); kmem_cache_destroy(rsb_cache); } @@ -129,3 +137,13 @@ void dlm_free_writequeue(struct writequeue_entry *writequeue) { kmem_cache_free(writequeue_cache, 
writequeue); } + +struct dlm_msg *dlm_allocate_msg(gfp_t allocation) +{ + return kmem_cache_alloc(msg_cache, allocation); +} + +void dlm_free_msg(struct dlm_msg *msg) +{ + kmem_cache_free(msg_cache, msg); +} diff --git a/fs/dlm/memory.h b/fs/dlm/memory.h index 854269eacd445..7bd3f1a391ca7 100644 --- a/fs/dlm/memory.h +++ b/fs/dlm/memory.h @@ -24,6 +24,8 @@ struct dlm_mhandle *dlm_allocate_mhandle(void); void dlm_free_mhandle(struct dlm_mhandle *mhandle); struct writequeue_entry *dlm_allocate_writequeue(void); void dlm_free_writequeue(struct writequeue_entry *writequeue); +struct dlm_msg *dlm_allocate_msg(gfp_t allocation); +void dlm_free_msg(struct dlm_msg *msg); #endif /* __MEMORY_DOT_H__ */ From feae43f8aa88309224b27bbe3a59fcb9aefab6f5 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Tue, 4 Jan 2022 09:09:47 -0500 Subject: [PATCH 32/32] fs: dlm: print cluster addr if non-cluster node connects This patch prints the cluster node address if a non-cluster node (according to the dlm config setting) tries to connect. The current hexdump call prints at a different loglevel and is only available if dynamic debug is enabled. Additionally, we use the IP address format strings to print an IETF IPv4/IPv6 string representation.
Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 23a1ff6907252..e284d696c1fdc 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1028,10 +1028,28 @@ static int accept_from_sock(struct listen_connection *con) /* Get the new node's NODEID */ make_sockaddr(&peeraddr, 0, &len); if (addr_to_nodeid(&peeraddr, &nodeid, &mark)) { - unsigned char *b=(unsigned char *)&peeraddr; - log_print("connect from non cluster node"); - print_hex_dump_bytes("ss: ", DUMP_PREFIX_NONE, - b, sizeof(struct sockaddr_storage)); + switch (peeraddr.ss_family) { + case AF_INET: { + struct sockaddr_in *sin = (struct sockaddr_in *)&peeraddr; + + log_print("connect from non cluster IPv4 node %pI4", + &sin->sin_addr); + break; + } +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: { + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&peeraddr; + + log_print("connect from non cluster IPv6 node %pI6c", + &sin6->sin6_addr); + break; + } +#endif + default: + log_print("invalid family from non cluster node"); + break; + } + sock_release(newsock); return -1; }