Skip to content

Commit

Permalink
drbd: improve network timeout detection
Browse files Browse the repository at this point in the history
Don't blame the peer for being unresponsive,
if we did not even ask the question yet.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
  • Loading branch information
Lars Ellenberg authored and Jens Axboe committed Nov 25, 2015
1 parent 142207f commit 84d34f2
Show file tree
Hide file tree
Showing 3 changed files with 100 additions and 27 deletions.
2 changes: 2 additions & 0 deletions drivers/block/drbd/drbd_int.h
Original file line number Diff line number Diff line change
Expand Up @@ -773,6 +773,8 @@ struct drbd_connection {
struct drbd_thread_timing_details r_timing_details[DRBD_THREAD_DETAILS_HIST];

struct {
unsigned long last_sent_barrier_jif;

/* whether this sender thread
* has processed a single write yet. */
bool seen_any_write_yet;
Expand Down
123 changes: 96 additions & 27 deletions drivers/block/drbd/drbd_req.c
Original file line number Diff line number Diff line change
Expand Up @@ -1531,6 +1531,78 @@ blk_qc_t drbd_make_request(struct request_queue *q, struct bio *bio)
return BLK_QC_T_NONE;
}

static bool net_timeout_reached(struct drbd_request *net_req,
struct drbd_connection *connection,
unsigned long now, unsigned long ent,
unsigned int ko_count, unsigned int timeout)
{
struct drbd_device *device = net_req->device;

if (!time_after(now, net_req->pre_send_jif + ent))
return false;

if (time_in_range(now, connection->last_reconnect_jif, connection->last_reconnect_jif + ent))
return false;

if (net_req->rq_state & RQ_NET_PENDING) {
drbd_warn(device, "Remote failed to finish a request within %ums > ko-count (%u) * timeout (%u * 0.1s)\n",
jiffies_to_msecs(now - net_req->pre_send_jif), ko_count, timeout);
return true;
}

/* We received an ACK already (or are using protocol A),
* but are waiting for the epoch closing barrier ack.
* Check if we sent the barrier already. We should not blame the peer
* for being unresponsive, if we did not even ask it yet. */
if (net_req->epoch == connection->send.current_epoch_nr) {
drbd_warn(device,
"We did not send a P_BARRIER for %ums > ko-count (%u) * timeout (%u * 0.1s); drbd kernel thread blocked?\n",
jiffies_to_msecs(now - net_req->pre_send_jif), ko_count, timeout);
return false;
}

/* Worst case: we may have been blocked for whatever reason, then
* suddenly are able to send a lot of requests (and epoch separating
* barriers) in quick succession.
* The timestamp of the net_req may be much too old and not correspond
* to the sending time of the relevant unack'ed barrier packet, so
* would trigger a spurious timeout. The latest barrier packet may
* have a too recent timestamp to trigger the timeout, potentially miss
* a timeout. Right now we don't have a place to conveniently store
* these timestamps.
* But in this particular situation, the application requests are still
* completed to upper layers, DRBD should still "feel" responsive.
* No need yet to kill this connection, it may still recover.
* If not, eventually we will have queued enough into the network for
* us to block. From that point of view, the timestamp of the last sent
* barrier packet is relevant enough.
*/
if (time_after(now, connection->send.last_sent_barrier_jif + ent)) {
drbd_warn(device, "Remote failed to answer a P_BARRIER (sent at %lu jif; now=%lu jif) within %ums > ko-count (%u) * timeout (%u * 0.1s)\n",
connection->send.last_sent_barrier_jif, now,
jiffies_to_msecs(now - connection->send.last_sent_barrier_jif), ko_count, timeout);
return true;
}
return false;
}

/* A request is considered timed out, if
* - we have some effective timeout from the configuration,
* with some state restrictions applied,
* - the oldest request is waiting for a response from the network
* resp. the local disk,
* - the oldest request is in fact older than the effective timeout,
* - the connection was established (resp. disk was attached)
* for longer than the timeout already.
* Note that for 32bit jiffies and very stable connections/disks,
* we may have a wrap around, which is catched by
* !time_in_range(now, last_..._jif, last_..._jif + timeout).
*
* Side effect: once per 32bit wrap-around interval, which means every
* ~198 days with 250 HZ, we have a window where the timeout would need
* to expire twice (worst case) to become effective. Good enough.
*/

void request_timer_fn(unsigned long data)
{
struct drbd_device *device = (struct drbd_device *) data;
Expand All @@ -1540,18 +1612,23 @@ void request_timer_fn(unsigned long data)
unsigned long oldest_submit_jif;
unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */
unsigned long now;
unsigned int ko_count = 0, timeout = 0;

rcu_read_lock();
nc = rcu_dereference(connection->net_conf);
if (nc && device->state.conn >= C_WF_REPORT_PARAMS)
ent = nc->timeout * HZ/10 * nc->ko_count;
if (nc && device->state.conn >= C_WF_REPORT_PARAMS) {
ko_count = nc->ko_count;
timeout = nc->timeout;
}

if (get_ldev(device)) { /* implicit state.disk >= D_INCONSISTENT */
dt = rcu_dereference(device->ldev->disk_conf)->disk_timeout * HZ / 10;
put_ldev(device);
}
rcu_read_unlock();


ent = timeout * HZ/10 * ko_count;
et = min_not_zero(dt, ent);

if (!et)
Expand All @@ -1563,11 +1640,22 @@ void request_timer_fn(unsigned long data)
spin_lock_irq(&device->resource->req_lock);
req_read = list_first_entry_or_null(&device->pending_completion[0], struct drbd_request, req_pending_local);
req_write = list_first_entry_or_null(&device->pending_completion[1], struct drbd_request, req_pending_local);
req_peer = connection->req_not_net_done;

/* maybe the oldest request waiting for the peer is in fact still
* blocking in tcp sendmsg */
if (!req_peer && connection->req_next && connection->req_next->pre_send_jif)
req_peer = connection->req_next;
* blocking in tcp sendmsg. That's ok, though, that's handled via the
* socket send timeout, requesting a ping, and bumping ko-count in
* we_should_drop_the_connection().
*/

/* check the oldest request we did successfully sent,
* but which is still waiting for an ACK. */
req_peer = connection->req_ack_pending;

/* if we don't have such request (e.g. protocoll A)
* check the oldest requests which is still waiting on its epoch
* closing barrier ack. */
if (!req_peer)
req_peer = connection->req_not_net_done;

/* evaluate the oldest peer request only in one timer! */
if (req_peer && req_peer->device != device)
Expand All @@ -1584,28 +1672,9 @@ void request_timer_fn(unsigned long data)
: req_write ? req_write->pre_submit_jif
: req_read ? req_read->pre_submit_jif : now;

/* The request is considered timed out, if
* - we have some effective timeout from the configuration,
* with above state restrictions applied,
* - the oldest request is waiting for a response from the network
* resp. the local disk,
* - the oldest request is in fact older than the effective timeout,
* - the connection was established (resp. disk was attached)
* for longer than the timeout already.
* Note that for 32bit jiffies and very stable connections/disks,
* we may have a wrap around, which is catched by
* !time_in_range(now, last_..._jif, last_..._jif + timeout).
*
* Side effect: once per 32bit wrap-around interval, which means every
* ~198 days with 250 HZ, we have a window where the timeout would need
* to expire twice (worst case) to become effective. Good enough.
*/
if (ent && req_peer &&
time_after(now, req_peer->pre_send_jif + ent) &&
!time_in_range(now, connection->last_reconnect_jif, connection->last_reconnect_jif + ent)) {
drbd_warn(device, "Remote failed to finish a request within ko-count * timeout\n");
if (ent && req_peer && net_timeout_reached(req_peer, connection, now, ent, ko_count, timeout))
_conn_request_state(connection, NS(conn, C_TIMEOUT), CS_VERBOSE | CS_HARD);
}

if (dt && oldest_submit_jif != now &&
time_after(now, oldest_submit_jif + dt) &&
!time_in_range(now, device->last_reattach_jif, device->last_reattach_jif + dt)) {
Expand Down
2 changes: 2 additions & 0 deletions drivers/block/drbd/drbd_worker.c
Original file line number Diff line number Diff line change
Expand Up @@ -1290,6 +1290,7 @@ static int drbd_send_barrier(struct drbd_connection *connection)
p->barrier = connection->send.current_epoch_nr;
p->pad = 0;
connection->send.current_epoch_writes = 0;
connection->send.last_sent_barrier_jif = jiffies;

return conn_send_command(connection, sock, P_BARRIER, sizeof(*p), NULL, 0);
}
Expand All @@ -1314,6 +1315,7 @@ static void re_init_if_first_write(struct drbd_connection *connection, unsigned
connection->send.seen_any_write_yet = true;
connection->send.current_epoch_nr = epoch;
connection->send.current_epoch_writes = 0;
connection->send.last_sent_barrier_jif = jiffies;
}
}

Expand Down

0 comments on commit 84d34f2

Please sign in to comment.