Skip to content

Commit

Permalink
[PATCH] ocfs2: add dlm_wait_for_node_death
Browse files Browse the repository at this point in the history
* Add the dlm_wait_for_node_death function, to be used after receiving a
  network error.  It waits up to the given timeout so that the heartbeat
  callbacks can update the domain map.  Without this, some paths may spin
  and consume enough CPU that the heartbeat is starved and never updates.

Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
  • Loading branch information
Kurt Hackel authored and Mark Fasheh committed Feb 16, 2006
1 parent e2b5e45 commit 44465a7
Show file tree
Hide file tree
Showing 4 changed files with 40 additions and 1 deletion.
4 changes: 4 additions & 0 deletions fs/ocfs2/dlm/dlmcommon.h
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,9 @@ static inline void __dlm_set_joining_node(struct dlm_ctxt *dlm,
#define DLM_LOCK_RES_IN_PROGRESS 0x00000010
#define DLM_LOCK_RES_MIGRATING 0x00000020

/* max milliseconds to wait to sync up a network failure with a node death */
#define DLM_NODE_DEATH_WAIT_MAX (5 * 1000)

#define DLM_PURGE_INTERVAL_MS (8 * 1000)

struct dlm_lock_resource
Expand Down Expand Up @@ -658,6 +661,7 @@ int dlm_launch_recovery_thread(struct dlm_ctxt *dlm);
void dlm_complete_recovery_thread(struct dlm_ctxt *dlm);
void dlm_wait_for_recovery(struct dlm_ctxt *dlm);
int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node);
int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout);

void dlm_put(struct dlm_ctxt *dlm);
struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm);
Expand Down
5 changes: 5 additions & 0 deletions fs/ocfs2/dlm/dlmconvert.c
Original file line number Diff line number Diff line change
Expand Up @@ -392,6 +392,11 @@ static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm,
} else {
mlog_errno(tmpret);
if (dlm_is_host_down(tmpret)) {
/* instead of logging the same network error over
* and over, sleep here and wait for the heartbeat
* to notice the node is dead. times out after 5s. */
dlm_wait_for_node_death(dlm, res->owner,
DLM_NODE_DEATH_WAIT_MAX);
ret = DLM_RECOVERING;
mlog(0, "node %u died so returning DLM_RECOVERING "
"from convert message!\n", res->owner);
Expand Down
14 changes: 13 additions & 1 deletion fs/ocfs2/dlm/dlmlock.c
Original file line number Diff line number Diff line change
Expand Up @@ -646,7 +646,19 @@ enum dlm_status dlmlock(struct dlm_ctxt *dlm, int mode,
mlog(0, "retrying lock with migration/"
"recovery/in progress\n");
msleep(100);
dlm_wait_for_recovery(dlm);
/* no waiting for dlm_reco_thread */
if (recovery) {
if (status == DLM_RECOVERING) {
mlog(0, "%s: got RECOVERING "
"for $REOCVERY lock, master "
"was %u\n", dlm->name,
res->owner);
dlm_wait_for_node_death(dlm, res->owner,
DLM_NODE_DEATH_WAIT_MAX);
}
} else {
dlm_wait_for_recovery(dlm);
}
goto retry_lock;
}

Expand Down
18 changes: 18 additions & 0 deletions fs/ocfs2/dlm/dlmrecovery.c
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,24 @@ int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node)
return dead;
}

/*
 * dlm_wait_for_node_death - sleep until a node is reported dead.
 *
 * @dlm:     domain whose dlm_reco_thread_wq wait queue is slept on
 * @node:    node number passed to dlm_is_node_dead() as the wake condition
 * @timeout: upper bound on the wait, in milliseconds; 0 means wait with
 *           no upper bound
 *
 * Blocks on dlm->dlm_reco_thread_wq until dlm_is_node_dead(dlm, node)
 * becomes true or, when a non-zero timeout is supplied, until that many
 * milliseconds have elapsed.  The outcome of the wait is not reported:
 * the function currently returns 0 in every case, so a caller cannot
 * distinguish "node died" from "timed out" via the return value.
 */
int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout)
{
	/* A zero timeout selects the unbounded wait. */
	if (!timeout) {
		mlog(ML_NOTICE, "%s: waiting indefinitely for notification "
		     "of death of node %u\n", dlm->name, node);
		wait_event(dlm->dlm_reco_thread_wq,
			   dlm_is_node_dead(dlm, node));
		return 0;
	}

	/* Bounded wait: give up after `timeout` milliseconds. */
	mlog(ML_NOTICE, "%s: waiting %dms for notification of "
	     "death of node %u\n", dlm->name, timeout, node);
	wait_event_timeout(dlm->dlm_reco_thread_wq,
			   dlm_is_node_dead(dlm, node),
			   msecs_to_jiffies(timeout));

	/* The wait result is intentionally dropped; always report 0 for now. */
	return 0;
}

/* callers of the top-level api calls (dlmlock/dlmunlock) should
* block on the dlm->reco.event when recovery is in progress.
* the dlm recovery thread will set this state when it begins
Expand Down

0 comments on commit 44465a7

Please sign in to comment.