From 08dd3b4ed4955cd0605fce83bf7324288f10d97f Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Mon, 3 Sep 2012 15:48:21 +0200 Subject: [PATCH] --- yaml --- r: 345691 b: refs/heads/master c: 7fb907c15fb8d0e10e72c8566a13f6defab3f484 h: refs/heads/master i: 345689: 3f381e2b952e26302bd0fd229ad630b3ad8bd1a3 345687: f858a56ad879631589e656a464db5234d5c4148c v: v3 --- [refs] | 2 +- trunk/drivers/block/drbd/drbd_worker.c | 36 ++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/[refs] b/[refs] index 60ea616a282d..9ee1e1ceb8d4 100644 --- a/[refs] +++ b/[refs] @@ -1,2 +1,2 @@ --- -refs/heads/master: dbd0820c6f7b7db9a97d63ea379fc174a63ddbca +refs/heads/master: 7fb907c15fb8d0e10e72c8566a13f6defab3f484 diff --git a/trunk/drivers/block/drbd/drbd_worker.c b/trunk/drivers/block/drbd/drbd_worker.c index 1352455dd7dd..66dcb2d7eada 100644 --- a/trunk/drivers/block/drbd/drbd_worker.c +++ b/trunk/drivers/block/drbd/drbd_worker.c @@ -227,6 +227,42 @@ void drbd_endio_pri(struct bio *bio, int error) error = -EIO; } + /* If this request was aborted locally before, + * but now was completed "successfully", + * chances are that this caused arbitrary data corruption. + * + * "aborting" requests, or force-detaching the disk, is intended for + * completely blocked/hung local backing devices which no longer + * complete requests at all, not even error completions. In this + * situation, usually a hard-reset and failover is the only way out. + * + * By "aborting", basically faking a local error-completion, + * we allow for a more graceful switchover by cleanly migrating services. + * Still the affected node has to be rebooted "soon". + * + * By completing these requests, we allow the upper layers to re-use + * the associated data pages. 
+ * + * If later the local backing device "recovers", and now DMAs some data + * from disk into the original request pages, in the best case it will + * just put random data into unused pages; but typically it will corrupt + * data that is by now completely unrelated, causing all sorts of damage. + * + * Which means delayed successful completion, + * especially for READ requests, + * is a reason to panic(). + * + * We assume that a delayed *error* completion is OK, + * though we still will complain noisily about it. + */ + if (unlikely(req->rq_state & RQ_LOCAL_ABORTED)) { + if (__ratelimit(&drbd_ratelimit_state)) + dev_emerg(DEV, "delayed completion of aborted local request; disk-timeout may be too aggressive\n"); + + if (!error) + panic("possible random memory corruption caused by delayed completion of aborted local request\n"); + } + /* to avoid recursion in __req_mod */ if (unlikely(error)) { what = (bio_data_dir(bio) == WRITE)