Skip to content

Commit

Permalink
drbd: fix for spurious fullsync (uuids rotated too fast)
Browse files Browse the repository at this point in the history
If it was an "empty" resync, the SyncSource may have already "finished"
the resync and rotated the UUIDs, before noticing the connection loss
(and generating a new uuid, if Primary, rotating again), while the
SyncTarget did not change its uuids at all, or only got to the previous
sync-uuid.
This would then again lead to a full sync on next handshake
(see also Bug #251).

Fix:
Use explicit resync finished notification even for empty resyncs,
do not finish an empty resync implicitly on the SyncSource.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
  • Loading branch information
Lars Ellenberg authored and Philipp Reisner committed Oct 14, 2010
1 parent e9ef7bb commit af85e8e
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 11 deletions.
5 changes: 5 additions & 0 deletions drivers/block/drbd/drbd_main.c
Original file line number Diff line number Diff line change
Expand Up @@ -1426,6 +1426,11 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
(os.user_isp && !ns.user_isp))
resume_next_sg(mdev);

/* sync target done with resync. Explicitly notify peer, even though
* it should (at least for non-empty resyncs) already know itself. */
if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
drbd_send_state(mdev);

/* free tl_hash if we Got thawed and are C_STANDALONE */
if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash)
drbd_free_tl_hash(mdev);
Expand Down
42 changes: 31 additions & 11 deletions drivers/block/drbd/drbd_worker.c
Original file line number Diff line number Diff line change
Expand Up @@ -522,6 +522,12 @@ int w_make_resync_request(struct drbd_conf *mdev,
dev_err(DEV, "%s in w_make_resync_request\n",
drbd_conn_str(mdev->state.conn));

if (mdev->rs_total == 0) {
/* empty resync? */
drbd_resync_finished(mdev);
return 1;
}

if (!get_ldev(mdev)) {
/* Since we only need to access mdev->rsync a
get_ldev_if_state(mdev,D_FAILED) would be sufficient, but
Expand Down Expand Up @@ -768,6 +774,14 @@ static int w_resync_finished(struct drbd_conf *mdev, struct drbd_work *w, int ca
return 1;
}

static void ping_peer(struct drbd_conf *mdev)
{
clear_bit(GOT_PING_ACK, &mdev->flags);
request_ping(mdev);
wait_event(mdev->misc_wait,
test_bit(GOT_PING_ACK, &mdev->flags) || mdev->state.conn < C_CONNECTED);
}

int drbd_resync_finished(struct drbd_conf *mdev)
{
unsigned long db, dt, dbdt;
Expand Down Expand Up @@ -807,6 +821,8 @@ int drbd_resync_finished(struct drbd_conf *mdev)
if (!get_ldev(mdev))
goto out;

ping_peer(mdev);

spin_lock_irq(&mdev->req_lock);
os = mdev->state;

Expand Down Expand Up @@ -1420,14 +1436,6 @@ int drbd_alter_sa(struct drbd_conf *mdev, int na)
return retcode;
}

static void ping_peer(struct drbd_conf *mdev)
{
clear_bit(GOT_PING_ACK, &mdev->flags);
request_ping(mdev);
wait_event(mdev->misc_wait,
test_bit(GOT_PING_ACK, &mdev->flags) || mdev->state.conn < C_CONNECTED);
}

/**
* drbd_start_resync() - Start the resync process
* @mdev: DRBD device.
Expand Down Expand Up @@ -1527,9 +1535,21 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
(unsigned long) mdev->rs_total << (BM_BLOCK_SHIFT-10),
(unsigned long) mdev->rs_total);

if (mdev->rs_total == 0) {
/* Peer still reachable? Beware of failing before-resync-target handlers! */
ping_peer(mdev);
if (mdev->agreed_pro_version < 95 && mdev->rs_total == 0) {
/* This still has a race (about when exactly the peers
* detect connection loss) that can lead to a full sync
* on next handshake. In 8.3.9 we fixed this with explicit
* resync-finished notifications, but the fix
* introduces a protocol change. Sleeping for some
* time longer than the ping interval + timeout on the
* SyncSource, to give the SyncTarget the chance to
* detect connection loss, then waiting for a ping
* response (implicit in drbd_resync_finished) reduces
* the race considerably, but does not solve it. */
if (side == C_SYNC_SOURCE)
schedule_timeout_interruptible(
mdev->net_conf->ping_int * HZ +
mdev->net_conf->ping_timeo*HZ/9);
drbd_resync_finished(mdev);
}

Expand Down

0 comments on commit af85e8e

Please sign in to comment.