Skip to content

Commit

Permalink
drbd: Improve how conflicting writes are handled
Browse files Browse the repository at this point in the history
The previous algorithm for dealing with overlapping concurrent writes
was generating unnecessary warnings for scenarios which could be
legitimate, and did not always handle partially overlapping requests
correctly.  Improve the algorithm as follows:

* While local or remote write requests are in progress, conflicting new
  local write requests will be delayed (commit 82172f7).

* When a conflict between a local and remote write request is detected,
  the node with the discard flag decides how to resolve the conflict: It
  will ask its peer to discard conflicting requests which are fully
  contained in the local request and retry requests which overlap only
  partially.  This involves a protocol change.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
  • Loading branch information
Andreas Gruenbacher authored and Philipp Reisner committed Oct 14, 2011
1 parent 71b1c1e commit 7be8da0
Show file tree
Hide file tree
Showing 5 changed files with 351 additions and 191 deletions.
15 changes: 11 additions & 4 deletions drivers/block/drbd/drbd_int.h
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@ enum drbd_packet {
P_RECV_ACK = 0x15, /* Used in protocol B */
P_WRITE_ACK = 0x16, /* Used in protocol C */
P_RS_WRITE_ACK = 0x17, /* Is a P_WRITE_ACK, additionally call set_in_sync(). */
P_DISCARD_ACK = 0x18, /* Used in proto C, two-primaries conflict detection */
P_DISCARD_WRITE = 0x18, /* Used in proto C, two-primaries conflict detection */
P_NEG_ACK = 0x19, /* Sent if local disk is unusable */
P_NEG_DREPLY = 0x1a, /* Local disk is broken... */
P_NEG_RS_DREPLY = 0x1b, /* Local disk is broken... */
Expand All @@ -223,8 +223,9 @@ enum drbd_packet {
P_RS_CANCEL = 0x29, /* meta: Used to cancel RS_DATA_REQUEST packet by SyncSource */
P_CONN_ST_CHG_REQ = 0x2a, /* data sock: Connection wide state request */
P_CONN_ST_CHG_REPLY = 0x2b, /* meta sock: Connection side state req reply */
P_RETRY_WRITE = 0x2c, /* Protocol C: retry conflicting write request */

P_MAX_CMD = 0x2c,
P_MAX_CMD = 0x2d,
P_MAY_IGNORE = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */
P_MAX_OPT_CMD = 0x101,

Expand Down Expand Up @@ -350,7 +351,7 @@ struct p_data {
* commands which share a struct:
* p_block_ack:
* P_RECV_ACK (proto B), P_WRITE_ACK (proto C),
* P_DISCARD_ACK (proto C, two-primaries conflict detection)
* P_DISCARD_WRITE (proto C, two-primaries conflict detection)
* p_block_req:
* P_DATA_REQUEST, P_RS_DATA_REQUEST
*/
Expand All @@ -362,7 +363,6 @@ struct p_block_ack {
u32 seq_num;
} __packed;


struct p_block_req {
struct p_header head;
u64 sector;
Expand Down Expand Up @@ -655,6 +655,8 @@ struct drbd_work {

#include "drbd_interval.h"

extern int drbd_wait_misc(struct drbd_conf *, struct drbd_interval *);

struct drbd_request {
struct drbd_work w;

Expand Down Expand Up @@ -752,12 +754,16 @@ enum {

/* This ee has a pointer to a digest instead of a block id */
__EE_HAS_DIGEST,

/* Conflicting local requests need to be restarted after this request */
__EE_RESTART_REQUESTS,
};
#define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO)
#define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC)
#define EE_RESUBMITTED (1<<__EE_RESUBMITTED)
#define EE_WAS_ERROR (1<<__EE_WAS_ERROR)
#define EE_HAS_DIGEST (1<<__EE_HAS_DIGEST)
#define EE_RESTART_REQUESTS (1<<__EE_RESTART_REQUESTS)

/* flag bits per mdev */
enum {
Expand Down Expand Up @@ -1478,6 +1484,7 @@ extern void drbd_free_tconn(struct drbd_tconn *tconn);
extern int proc_details;

/* drbd_req */
extern int __drbd_make_request(struct drbd_conf *, struct bio *, unsigned long);
extern int drbd_make_request(struct request_queue *q, struct bio *bio);
extern int drbd_read_remote(struct drbd_conf *mdev, struct drbd_request *req);
extern int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec);
Expand Down
35 changes: 34 additions & 1 deletion drivers/block/drbd/drbd_main.c
Original file line number Diff line number Diff line change
Expand Up @@ -3003,7 +3003,7 @@ const char *cmdname(enum drbd_packet cmd)
[P_RECV_ACK] = "RecvAck",
[P_WRITE_ACK] = "WriteAck",
[P_RS_WRITE_ACK] = "RSWriteAck",
[P_DISCARD_ACK] = "DiscardAck",
[P_DISCARD_WRITE] = "DiscardWrite",
[P_NEG_ACK] = "NegAck",
[P_NEG_DREPLY] = "NegDReply",
[P_NEG_RS_DREPLY] = "NegRSDReply",
Expand All @@ -3018,6 +3018,7 @@ const char *cmdname(enum drbd_packet cmd)
[P_COMPRESSED_BITMAP] = "CBitmap",
[P_DELAY_PROBE] = "DelayProbe",
[P_OUT_OF_SYNC] = "OutOfSync",
[P_RETRY_WRITE] = "RetryWrite",
[P_MAX_CMD] = NULL,
};

Expand All @@ -3032,6 +3033,38 @@ const char *cmdname(enum drbd_packet cmd)
return cmdnames[cmd];
}

/**
* drbd_wait_misc - wait for a request to make progress
* @mdev: device associated with the request
* @i: the struct drbd_interval embedded in struct drbd_request or
* struct drbd_peer_request
*/
int drbd_wait_misc(struct drbd_conf *mdev, struct drbd_interval *i)
{
struct net_conf *net_conf = mdev->tconn->net_conf;
DEFINE_WAIT(wait);
long timeout;

if (!net_conf)
return -ETIMEDOUT;
timeout = MAX_SCHEDULE_TIMEOUT;
if (net_conf->ko_count)
timeout = net_conf->timeout * HZ / 10 * net_conf->ko_count;

/* Indicate to wake up mdev->misc_wait on progress. */
i->waiting = true;
prepare_to_wait(&mdev->misc_wait, &wait, TASK_INTERRUPTIBLE);
spin_unlock_irq(&mdev->tconn->req_lock);
timeout = schedule_timeout(timeout);
finish_wait(&mdev->misc_wait, &wait);
spin_lock_irq(&mdev->tconn->req_lock);
if (!timeout || mdev->state.conn < C_CONNECTED)
return -ETIMEDOUT;
if (signal_pending(current))
return -ERESTARTSYS;
return 0;
}

#ifdef CONFIG_DRBD_FAULT_INJECTION
/* Fault insertion support including random number generator shamelessly
* stolen from kernel/rcutorture.c */
Expand Down
Loading

0 comments on commit 7be8da0

Please sign in to comment.