Skip to content

Commit

Permalink
Fix read-balancing during node failure
Browse files Browse the repository at this point in the history
During a node failure, We need to suspend read balancing so that the
reads are directed to the first device and stale data is not read.
Suspending writes is not required because these would be recorded and
synced eventually.

A new flag MD_CLUSTER_SUSPEND_READ_BALANCING is set in recover_prep().
area_resyncing() will respond true for the entire devices if this
flag is set and the request type is READ. The flag is cleared
in recover_done().

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Reported-By: David Teigland <teigland@redhat.com>
Signed-off-by: NeilBrown <neilb@suse.com>
  • Loading branch information
Goldwyn Rodrigues authored and NeilBrown committed Jul 24, 2015
1 parent 33e38ac commit 90382ed
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 5 deletions.
12 changes: 11 additions & 1 deletion drivers/md/md-cluster.c
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ struct resync_info {

/* md_cluster_info flags */
#define MD_CLUSTER_WAITING_FOR_NEWDISK 1
#define MD_CLUSTER_SUSPEND_READ_BALANCING 2


struct md_cluster_info {
Expand Down Expand Up @@ -275,6 +276,9 @@ static void recover_bitmaps(struct md_thread *thread)

static void recover_prep(void *arg)
{
struct mddev *mddev = arg;
struct md_cluster_info *cinfo = mddev->cluster_info;
set_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state);
}

static void recover_slot(void *arg, struct dlm_slot *slot)
Expand Down Expand Up @@ -307,6 +311,7 @@ static void recover_done(void *arg, struct dlm_slot *slots,

cinfo->slot_number = our_slot;
complete(&cinfo->completion);
clear_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state);
}

static const struct dlm_lockspace_ops md_ls_ops = {
Expand Down Expand Up @@ -816,12 +821,17 @@ static void resync_finish(struct mddev *mddev)
resync_send(mddev, RESYNCING, 0, 0);
}

static int area_resyncing(struct mddev *mddev, sector_t lo, sector_t hi)
static int area_resyncing(struct mddev *mddev, int direction,
sector_t lo, sector_t hi)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
int ret = 0;
struct suspend_info *s;

if ((direction == READ) &&
test_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state))
return 1;

spin_lock_irq(&cinfo->suspend_lock);
if (list_empty(&cinfo->suspend_list))
goto out;
Expand Down
2 changes: 1 addition & 1 deletion drivers/md/md-cluster.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ struct md_cluster_operations {
int (*metadata_update_start)(struct mddev *mddev);
int (*metadata_update_finish)(struct mddev *mddev);
int (*metadata_update_cancel)(struct mddev *mddev);
int (*area_resyncing)(struct mddev *mddev, sector_t lo, sector_t hi);
int (*area_resyncing)(struct mddev *mddev, int direction, sector_t lo, sector_t hi);
int (*add_new_disk_start)(struct mddev *mddev, struct md_rdev *rdev);
int (*add_new_disk_finish)(struct mddev *mddev);
int (*new_disk_ack)(struct mddev *mddev, bool ack);
Expand Down
7 changes: 4 additions & 3 deletions drivers/md/raid1.c
Original file line number Diff line number Diff line change
Expand Up @@ -541,7 +541,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect

if ((conf->mddev->recovery_cp < this_sector + sectors) ||
(mddev_is_clustered(conf->mddev) &&
md_cluster_ops->area_resyncing(conf->mddev, this_sector,
md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector,
this_sector + sectors)))
choose_first = 1;
else
Expand Down Expand Up @@ -1111,7 +1111,8 @@ static void make_request(struct mddev *mddev, struct bio * bio)
((bio_end_sector(bio) > mddev->suspend_lo &&
bio->bi_iter.bi_sector < mddev->suspend_hi) ||
(mddev_is_clustered(mddev) &&
md_cluster_ops->area_resyncing(mddev, bio->bi_iter.bi_sector, bio_end_sector(bio))))) {
md_cluster_ops->area_resyncing(mddev, WRITE,
bio->bi_iter.bi_sector, bio_end_sector(bio))))) {
/* As the suspend_* range is controlled by
* userspace, we want an interruptible
* wait.
Expand All @@ -1124,7 +1125,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
if (bio_end_sector(bio) <= mddev->suspend_lo ||
bio->bi_iter.bi_sector >= mddev->suspend_hi ||
(mddev_is_clustered(mddev) &&
!md_cluster_ops->area_resyncing(mddev,
!md_cluster_ops->area_resyncing(mddev, WRITE,
bio->bi_iter.bi_sector, bio_end_sector(bio))))
break;
schedule();
Expand Down

0 comments on commit 90382ed

Please sign in to comment.