Merge tag 'md/4.3' of git://neil.brown.name/md
Pull md updates from Neil Brown:

 - an assortment of little fixes, several for minor races only likely to
   be hit during testing

 - further cluster-md-raid1 development, not ready for real use yet.

 - new RAID6 syndrome code for ARM NEON

 - fix a race where a write can return before failure of one device is
   properly recorded in metadata, so an immediate crash might result in
   that write being lost.

* tag 'md/4.3' of git://neil.brown.name/md: (33 commits)
  md/raid5: ensure device failure recorded before write request returns.
  md/raid5: use bio_list for the list of bios to return.
  md/raid10: ensure device failure recorded before write request returns.
  md/raid1: ensure device failure recorded before write request returns.
  md-cluster: remove inappropriate try_module_get from join()
  md: extend spinlock protection in register_md_cluster_operations
  md-cluster: Read the disk bitmap sb and check if it needs recovery
  md-cluster: only call complete(&cinfo->completion) when node join cluster
  md-cluster: add missed lockres_free
  md-cluster: remove the unused sb_lock
  md-cluster: init suspend_list and suspend_lock early in join
  md-cluster: add the error check if failed to get dlm lock
  md-cluster: init completion within lockres_init
  md-cluster: fix deadlock issue on message lock
  md-cluster: transfer the resync ownership to another node
  md-cluster: split recover_slot for future code reuse
  md-cluster: use %pU to print UUIDs
  md: setup safemode_timer before it's being used
  md/raid5: handle possible race as reshape completes.
  md: sync sync_completed has correct value as recovery finishes.
  ...
Linus Torvalds committed Sep 6, 2015
2 parents 1744771 + e89c6fd commit 2a013e3
Showing 12 changed files with 433 additions and 193 deletions.
4 changes: 2 additions & 2 deletions Documentation/md-cluster.txt
@@ -91,7 +91,7 @@ The algorithm is:
this message inappropriate or redundant.

3. sender write LVB.
sender down-convert MESSAGE from EX to CR
sender down-convert MESSAGE from EX to CW
sender try to get EX of ACK
[ wait until all receiver has *processed* the MESSAGE ]

@@ -112,7 +112,7 @@ The algorithm is:
sender down-convert ACK from EX to CR
sender release MESSAGE
sender release TOKEN
receiver upconvert to EX of MESSAGE
receiver upconvert to PR of MESSAGE
receiver get CR of ACK
receiver release MESSAGE

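For reference, the sender-side sequence described above maps onto DLM calls roughly as follows. This is a condensed sketch, not the kernel code: it reuses the dlm_lock_sync()/dlm_unlock_sync() helpers and the struct md_cluster_info fields from drivers/md/md-cluster.c below (the real implementation is lock_comm()/__sendmsg() there), assumes TOKEN is already held in EX, and trims most error handling.

static int send_message_sketch(struct md_cluster_info *cinfo,
			       struct cluster_msg *cmsg)
{
	int ret;

	/* take MESSAGE in EX and publish the message through its LVB */
	ret = dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_EX);
	if (ret)
		return ret;
	memcpy(cinfo->message_lockres->lksb.sb_lvbptr, cmsg, sizeof(*cmsg));

	/* down-convert MESSAGE EX -> CW (CR before this series): CW is
	 * compatible with the receivers' CR, so they can read the LVB,
	 * but it still blocks their later up-convert to PR until the
	 * sender releases MESSAGE */
	ret = dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_CW);
	if (ret)
		goto out_release;

	/* up-convert ACK CR -> EX: this fires the BAST on every node
	 * holding CR on ACK and is only granted once all of them have
	 * processed the message and dropped that CR */
	ret = dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_EX);
	if (ret)
		goto out_release;

	/* drop back to CR on ACK, then release MESSAGE so the receivers'
	 * pending PR up-convert can be granted */
	ret = dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_CR);
out_release:
	dlm_unlock_sync(cinfo->message_lockres);
	return ret;
}

The receive side (recv_daemon() below) is the mirror image: take CR on MESSAGE and read the LVB, process the message, drop CR on ACK to acknowledge, up-convert MESSAGE to PR (which waits until the sender's CW is released), re-take CR on ACK, and finally release MESSAGE.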
159 changes: 105 additions & 54 deletions drivers/md/md-cluster.c
@@ -45,14 +45,14 @@ struct resync_info {
/* md_cluster_info flags */
#define MD_CLUSTER_WAITING_FOR_NEWDISK 1
#define MD_CLUSTER_SUSPEND_READ_BALANCING 2
#define MD_CLUSTER_BEGIN_JOIN_CLUSTER 3


struct md_cluster_info {
/* dlm lock space and resources for clustered raid. */
dlm_lockspace_t *lockspace;
int slot_number;
struct completion completion;
struct dlm_lock_resource *sb_lock;
struct mutex sb_mutex;
struct dlm_lock_resource *bitmap_lockres;
struct list_head suspend_list;
@@ -75,6 +75,7 @@ enum msg_type {
NEWDISK,
REMOVE,
RE_ADD,
BITMAP_NEEDS_SYNC,
};

struct cluster_msg {
@@ -99,7 +100,6 @@ static int dlm_lock_sync(struct dlm_lock_resource *res, int mode)
{
int ret = 0;

init_completion(&res->completion);
ret = dlm_lock(res->ls, mode, &res->lksb,
res->flags, res->name, strlen(res->name),
0, sync_ast, res, res->bast);
@@ -124,6 +124,7 @@ static struct dlm_lock_resource *lockres_init(struct mddev *mddev,
res = kzalloc(sizeof(struct dlm_lock_resource), GFP_KERNEL);
if (!res)
return NULL;
init_completion(&res->completion);
res->ls = cinfo->lockspace;
res->mddev = mddev;
namelen = strlen(name);
@@ -165,30 +166,31 @@ static struct dlm_lock_resource *lockres_init(struct mddev *mddev,

static void lockres_free(struct dlm_lock_resource *res)
{
int ret;

if (!res)
return;

init_completion(&res->completion);
dlm_unlock(res->ls, res->lksb.sb_lkid, 0, &res->lksb, res);
/* cancel a lock request or a conversion request that is blocked */
res->flags |= DLM_LKF_CANCEL;
retry:
ret = dlm_unlock(res->ls, res->lksb.sb_lkid, 0, &res->lksb, res);
if (unlikely(ret != 0)) {
pr_info("%s: failed to unlock %s return %d\n", __func__, res->name, ret);

/* if a lock conversion is cancelled, then the lock is put
* back to grant queue, need to ensure it is unlocked */
if (ret == -DLM_ECANCEL)
goto retry;
}
res->flags &= ~DLM_LKF_CANCEL;
wait_for_completion(&res->completion);

kfree(res->name);
kfree(res->lksb.sb_lvbptr);
kfree(res);
}

static char *pretty_uuid(char *dest, char *src)
{
int i, len = 0;

for (i = 0; i < 16; i++) {
if (i == 4 || i == 6 || i == 8 || i == 10)
len += sprintf(dest + len, "-");
len += sprintf(dest + len, "%02x", (__u8)src[i]);
}
return dest;
}

static void add_resync_info(struct mddev *mddev, struct dlm_lock_resource *lockres,
sector_t lo, sector_t hi)
{
@@ -281,16 +283,11 @@ static void recover_prep(void *arg)
set_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state);
}

static void recover_slot(void *arg, struct dlm_slot *slot)
static void __recover_slot(struct mddev *mddev, int slot)
{
struct mddev *mddev = arg;
struct md_cluster_info *cinfo = mddev->cluster_info;

pr_info("md-cluster: %s Node %d/%d down. My slot: %d. Initiating recovery.\n",
mddev->bitmap_info.cluster_name,
slot->nodeid, slot->slot,
cinfo->slot_number);
set_bit(slot->slot - 1, &cinfo->recovery_map);
set_bit(slot, &cinfo->recovery_map);
if (!cinfo->recovery_thread) {
cinfo->recovery_thread = md_register_thread(recover_bitmaps,
mddev, "recover");
@@ -302,6 +299,20 @@ static void recover_slot(void *arg, struct dlm_slot *slot)
md_wakeup_thread(cinfo->recovery_thread);
}

static void recover_slot(void *arg, struct dlm_slot *slot)
{
struct mddev *mddev = arg;
struct md_cluster_info *cinfo = mddev->cluster_info;

pr_info("md-cluster: %s Node %d/%d down. My slot: %d. Initiating recovery.\n",
mddev->bitmap_info.cluster_name,
slot->nodeid, slot->slot,
cinfo->slot_number);
/* deduct one since dlm slot starts from one while the num of
* cluster-md begins with 0 */
__recover_slot(mddev, slot->slot - 1);
}

static void recover_done(void *arg, struct dlm_slot *slots,
int num_slots, int our_slot,
uint32_t generation)
@@ -310,10 +321,17 @@ static void recover_done(void *arg, struct dlm_slot *slots,
struct md_cluster_info *cinfo = mddev->cluster_info;

cinfo->slot_number = our_slot;
complete(&cinfo->completion);
/* completion is only need to be complete when node join cluster,
* it doesn't need to run during another node's failure */
if (test_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state)) {
complete(&cinfo->completion);
clear_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state);
}
clear_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state);
}

/* the ops is called when node join the cluster, and do lock recovery
* if node failure occurs */
static const struct dlm_lockspace_ops md_ls_ops = {
.recover_prep = recover_prep,
.recover_slot = recover_slot,
@@ -388,7 +406,7 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
int len;

len = snprintf(disk_uuid, 64, "DEVICE_UUID=");
pretty_uuid(disk_uuid + len, cmsg->uuid);
sprintf(disk_uuid + len, "%pU", cmsg->uuid);
snprintf(raid_slot, 16, "RAID_DISK=%d", cmsg->raid_slot);
pr_info("%s:%d Sending kobject change with %s and %s\n", __func__, __LINE__, disk_uuid, raid_slot);
init_completion(&cinfo->newdisk_completion);
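Related cleanup visible in this hunk: the hand-rolled pretty_uuid() helper removed further up is replaced by the kernel's %pU printf extension, both for the DEVICE_UUID= string here and for the lockspace name in join() below. A minimal sketch of the pattern, using the mddev->uuid field this file already formats:

	char str[64];

	/* %pU expands a pointer to 16 UUID bytes into
	 * xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx (lower case, array byte
	 * order, the same layout pretty_uuid() built by hand), so the
	 * 16-iteration sprintf loop is no longer needed. */
	sprintf(str, "%pU", mddev->uuid);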
@@ -457,6 +475,11 @@ static void process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
__func__, __LINE__, msg->slot);
process_readd_disk(mddev, msg);
break;
case BITMAP_NEEDS_SYNC:
pr_info("%s: %d Received BITMAP_NEEDS_SYNC from %d\n",
__func__, __LINE__, msg->slot);
__recover_slot(mddev, msg->slot);
break;
default:
pr_warn("%s:%d Received unknown message from %d\n",
__func__, __LINE__, msg->slot);
@@ -472,6 +495,7 @@ static void recv_daemon(struct md_thread *thread)
struct dlm_lock_resource *ack_lockres = cinfo->ack_lockres;
struct dlm_lock_resource *message_lockres = cinfo->message_lockres;
struct cluster_msg msg;
int ret;

/*get CR on Message*/
if (dlm_lock_sync(message_lockres, DLM_LOCK_CR)) {
@@ -484,13 +508,21 @@
process_recvd_msg(thread->mddev, &msg);

/*release CR on ack_lockres*/
dlm_unlock_sync(ack_lockres);
/*up-convert to EX on message_lockres*/
dlm_lock_sync(message_lockres, DLM_LOCK_EX);
ret = dlm_unlock_sync(ack_lockres);
if (unlikely(ret != 0))
pr_info("unlock ack failed return %d\n", ret);
/*up-convert to PR on message_lockres*/
ret = dlm_lock_sync(message_lockres, DLM_LOCK_PR);
if (unlikely(ret != 0))
pr_info("lock PR on msg failed return %d\n", ret);
/*get CR on ack_lockres again*/
dlm_lock_sync(ack_lockres, DLM_LOCK_CR);
ret = dlm_lock_sync(ack_lockres, DLM_LOCK_CR);
if (unlikely(ret != 0))
pr_info("lock CR on ack failed return %d\n", ret);
/*release CR on message_lockres*/
dlm_unlock_sync(message_lockres);
ret = dlm_unlock_sync(message_lockres);
if (unlikely(ret != 0))
pr_info("unlock msg failed return %d\n", ret);
}

/* lock_comm()
@@ -519,7 +551,7 @@ static void unlock_comm(struct md_cluster_info *cinfo)
* The function:
* 1. Grabs the message lockresource in EX mode
* 2. Copies the message to the message LVB
* 3. Downconverts message lockresource to CR
* 3. Downconverts message lockresource to CW
* 4. Upconverts ack lock resource from CR to EX. This forces the BAST on other nodes
* and the other nodes read the message. The thread will wait here until all other
* nodes have released ack lock resource.
@@ -540,12 +572,12 @@ static int __sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg)

memcpy(cinfo->message_lockres->lksb.sb_lvbptr, (void *)cmsg,
sizeof(struct cluster_msg));
/*down-convert EX to CR on Message*/
error = dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_CR);
/*down-convert EX to CW on Message*/
error = dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_CW);
if (error) {
pr_err("md-cluster: failed to convert EX to CR on MESSAGE(%d)\n",
pr_err("md-cluster: failed to convert EX to CW on MESSAGE(%d)\n",
error);
goto failed_message;
goto failed_ack;
}

/*up-convert CR to EX on Ack*/
@@ -565,7 +597,13 @@ static int __sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg)
}

failed_ack:
dlm_unlock_sync(cinfo->message_lockres);
error = dlm_unlock_sync(cinfo->message_lockres);
if (unlikely(error != 0)) {
pr_err("md-cluster: failed convert to NL on MESSAGE(%d)\n",
error);
/* in case the message can't be released due to some reason */
goto failed_ack;
}
failed_message:
return error;
}
@@ -587,6 +625,7 @@ static int gather_all_resync_info(struct mddev *mddev, int total_slots)
struct dlm_lock_resource *bm_lockres;
struct suspend_info *s;
char str[64];
sector_t lo, hi;


for (i = 0; i < total_slots; i++) {
@@ -617,9 +656,24 @@ static int gather_all_resync_info(struct mddev *mddev, int total_slots)
lockres_free(bm_lockres);
continue;
}
if (ret)
if (ret) {
lockres_free(bm_lockres);
goto out;
/* TODO: Read the disk bitmap sb and check if it needs recovery */
}

/* Read the disk bitmap sb and check if it needs recovery */
ret = bitmap_copy_from_slot(mddev, i, &lo, &hi, false);
if (ret) {
pr_warn("md-cluster: Could not gather bitmaps from slot %d", i);
lockres_free(bm_lockres);
continue;
}
if ((hi > 0) && (lo < mddev->recovery_cp)) {
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
mddev->recovery_cp = lo;
md_check_recovery(mddev);
}

dlm_unlock_sync(bm_lockres);
lockres_free(bm_lockres);
}
@@ -633,20 +687,20 @@ static int join(struct mddev *mddev, int nodes)
int ret, ops_rv;
char str[64];

if (!try_module_get(THIS_MODULE))
return -ENOENT;

cinfo = kzalloc(sizeof(struct md_cluster_info), GFP_KERNEL);
if (!cinfo)
return -ENOMEM;

INIT_LIST_HEAD(&cinfo->suspend_list);
spin_lock_init(&cinfo->suspend_lock);
init_completion(&cinfo->completion);
set_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state);

mutex_init(&cinfo->sb_mutex);
mddev->cluster_info = cinfo;

memset(str, 0, 64);
pretty_uuid(str, mddev->uuid);
sprintf(str, "%pU", mddev->uuid);
ret = dlm_new_lockspace(str, mddev->bitmap_info.cluster_name,
DLM_LSFL_FS, LVB_SIZE,
&md_ls_ops, mddev, &ops_rv, &cinfo->lockspace);
@@ -659,12 +713,6 @@ static int join(struct mddev *mddev, int nodes)
ret = -ERANGE;
goto err;
}
cinfo->sb_lock = lockres_init(mddev, "cmd-super",
NULL, 0);
if (!cinfo->sb_lock) {
ret = -ENOMEM;
goto err;
}
/* Initiate the communication resources */
ret = -ENOMEM;
cinfo->recv_thread = md_register_thread(recv_daemon, mddev, "cluster_recv");
@@ -705,9 +753,6 @@ static int join(struct mddev *mddev, int nodes)
goto err;
}

INIT_LIST_HEAD(&cinfo->suspend_list);
spin_lock_init(&cinfo->suspend_lock);

ret = gather_all_resync_info(mddev, nodes);
if (ret)
goto err;
@@ -719,12 +764,10 @@ static int join(struct mddev *mddev, int nodes)
lockres_free(cinfo->ack_lockres);
lockres_free(cinfo->no_new_dev_lockres);
lockres_free(cinfo->bitmap_lockres);
lockres_free(cinfo->sb_lock);
if (cinfo->lockspace)
dlm_release_lockspace(cinfo->lockspace, 2);
mddev->cluster_info = NULL;
kfree(cinfo);
module_put(THIS_MODULE);
return ret;
}

Expand All @@ -740,7 +783,6 @@ static int leave(struct mddev *mddev)
lockres_free(cinfo->token_lockres);
lockres_free(cinfo->ack_lockres);
lockres_free(cinfo->no_new_dev_lockres);
lockres_free(cinfo->sb_lock);
lockres_free(cinfo->bitmap_lockres);
dlm_release_lockspace(cinfo->lockspace, 2);
return 0;
@@ -817,8 +859,17 @@ static int resync_start(struct mddev *mddev, sector_t lo, sector_t hi)

static void resync_finish(struct mddev *mddev)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
struct cluster_msg cmsg;
int slot = cinfo->slot_number - 1;

pr_info("%s:%d\n", __func__, __LINE__);
resync_send(mddev, RESYNCING, 0, 0);
if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
cmsg.type = cpu_to_le32(BITMAP_NEEDS_SYNC);
cmsg.slot = cpu_to_le32(slot);
sendmsg(cinfo, &cmsg);
}
}
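The BITMAP_NEEDS_SYNC message sent above reuses the node-failure machinery for an interrupted resync: the sender broadcasts its own slot number, and every receiver feeds it into __recover_slot(), exactly as recover_slot() does when a node really goes down. A rough sketch of what the woken "recover" thread is then expected to do for that slot, modelled on the bitmap handling in gather_all_resync_info() above (here slot stands for the slot number carried in the message; the clear_bits argument to bitmap_copy_from_slot() and the exact call site are assumptions, the real work is done by recover_bitmaps() in this file):

	sector_t lo = 0, hi = 0;

	/* merge the dirty region the interrupted node left in its slot's
	 * bitmap into the local view ... */
	if (!bitmap_copy_from_slot(mddev, slot, &lo, &hi, true) && hi > 0) {
		/* ... and let the md recovery machinery resume the resync
		 * from the lowest dirty sector */
		if (lo < mddev->recovery_cp)
			mddev->recovery_cp = lo;
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		md_check_recovery(mddev);
	}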

static int area_resyncing(struct mddev *mddev, int direction,