From 525b20987ec20aee38ab148e9201632995445789 Mon Sep 17 00:00:00 2001
From: Andrew Morton
Date: Mon, 17 Dec 2012 15:59:22 -0800
Subject: [PATCH]

--- yaml ---
r: 346340
b: refs/heads/master
c: d7124073add4cd04508f9ae3adc2746c61d7e78b
h: refs/heads/master
v: v3
---
 [refs] | 2 +-
 .../arch/powerpc/platforms/cell/spufs/sched.c | 2 +-
 trunk/arch/um/drivers/mconsole_kern.c | 2 +-
 .../arch/x86/include/uapi/asm/hw_breakpoint.h | 1 +
 trunk/arch/x86/include/uapi/asm/setup.h | 1 +
 trunk/block/genhd.c | 8 +-
 trunk/block/partitions/efi.c | 7 +-
 trunk/block/partitions/msdos.c | 21 +-
 trunk/drivers/block/cciss.c | 21 +-
 trunk/drivers/block/drbd/Kconfig | 10 +-
 trunk/drivers/block/drbd/Makefile | 2 -
 trunk/drivers/block/drbd/drbd_actlog.c | 702 +--
 trunk/drivers/block/drbd/drbd_bitmap.c | 249 +-
 trunk/drivers/block/drbd/drbd_int.h | 1365 +++---
 trunk/drivers/block/drbd/drbd_interval.c | 207 -
 trunk/drivers/block/drbd/drbd_interval.h | 40 -
 trunk/drivers/block/drbd/drbd_main.c | 3855 +++++++++-------
 trunk/drivers/block/drbd/drbd_nl.c | 3334 ++++++--------
 trunk/drivers/block/drbd/drbd_nla.c | 55 -
 trunk/drivers/block/drbd/drbd_nla.h | 8 -
 trunk/drivers/block/drbd/drbd_proc.c | 41 +-
 trunk/drivers/block/drbd/drbd_receiver.c | 3910 +++++++----------
 trunk/drivers/block/drbd/drbd_req.c | 1574 +++----
 trunk/drivers/block/drbd/drbd_req.h | 187 +-
 trunk/drivers/block/drbd/drbd_state.c | 1856 --------
 trunk/drivers/block/drbd/drbd_state.h | 161 -
 trunk/drivers/block/drbd/drbd_strings.c | 1 -
 trunk/drivers/block/drbd/drbd_worker.c | 1237 +++---
 trunk/drivers/block/drbd/drbd_wrappers.h | 11 +-
 trunk/drivers/block/loop.c | 10 -
 trunk/drivers/block/xen-blkback/blkback.c | 301 +-
 trunk/drivers/block/xen-blkback/common.h | 16 -
 trunk/drivers/block/xen-blkback/xenbus.c | 23 +-
 trunk/drivers/block/xen-blkfront.c | 199 +-
 trunk/drivers/firmware/efivars.c | 1 +
 trunk/drivers/md/md.c | 2 +-
 trunk/drivers/md/md.h | 26 +
 trunk/drivers/md/raid1.c | 15 +-
 trunk/drivers/md/raid10.c | 15 +-
 trunk/drivers/md/raid5.c | 12 +-
 trunk/drivers/staging/android/binder.c | 3 +-
 trunk/fs/attr.c | 11 +-
 trunk/fs/autofs4/autofs_i.h | 8 +-
 trunk/fs/autofs4/dev-ioctl.c | 4 +-
 trunk/fs/autofs4/inode.c | 24 +-
 trunk/fs/autofs4/waitq.c | 5 +-
 trunk/fs/exec.c | 9 +-
 trunk/fs/fuse/dev.c | 4 +-
 trunk/fs/fuse/dir.c | 20 +-
 trunk/fs/fuse/fuse_i.h | 4 +-
 trunk/fs/fuse/inode.c | 23 +-
 trunk/fs/hppfs/hppfs.c | 2 +-
 trunk/fs/mount.h | 3 -
 trunk/fs/namespace.c | 211 +-
 trunk/fs/open.c | 2 +-
 trunk/fs/pnode.h | 1 -
 trunk/fs/proc/Makefile | 1 -
 trunk/fs/proc/array.c | 2 +-
 trunk/fs/proc/base.c | 169 +-
 trunk/fs/proc/generic.c | 26 +-
 trunk/fs/proc/inode.c | 6 +-
 trunk/fs/proc/internal.h | 1 -
 trunk/fs/proc/namespaces.c | 185 +-
 trunk/fs/proc/root.c | 17 +-
 trunk/fs/proc/self.c | 59 -
 trunk/fs/sysfs/mount.c | 1 -
 trunk/include/linux/cred.h | 2 +
 trunk/include/linux/drbd.h | 81 +-
 trunk/include/linux/drbd_genl.h | 378 --
 trunk/include/linux/drbd_genl_api.h | 55 -
 trunk/include/linux/drbd_limits.h | 90 +-
 trunk/include/linux/drbd_nl.h | 163 +
 trunk/include/linux/drbd_tag_magic.h | 84 +
 trunk/include/linux/fs.h | 2 -
 trunk/include/linux/genhd.h | 8 +-
 trunk/include/linux/genl_magic_func.h | 422 --
 trunk/include/linux/genl_magic_struct.h | 277 --
 trunk/include/linux/idr.h | 11 -
 trunk/include/linux/ipc_namespace.h | 9 +-
 trunk/include/linux/loop.h | 3 -
 trunk/include/linux/lru_cache.h | 67 +-
 trunk/include/linux/mnt_namespace.h | 3 +-
 trunk/include/linux/nsproxy.h | 2 +-
 trunk/include/linux/pid_namespace.h | 11 +-
 trunk/include/linux/proc_fs.h | 26 +-
 trunk/include/linux/user_namespace.h | 10 -
 trunk/include/linux/utsname.h | 7 +-
 trunk/include/linux/wait.h | 164 -
 trunk/include/net/net_namespace.h | 2 -
 trunk/init/Kconfig | 2 +
 trunk/init/do_mounts.c | 61 +-
 trunk/init/main.c | 1 +
 trunk/init/version.c | 2 -
 trunk/ipc/msgutil.c | 2 -
 trunk/ipc/namespace.c | 32 +-
 trunk/kernel/cgroup.c | 2 +-
 trunk/kernel/events/core.c | 2 +-
 trunk/kernel/exit.c | 12 +
 trunk/kernel/fork.c | 69 +-
 trunk/kernel/nsproxy.c | 36 +-
 trunk/kernel/pid.c | 47 +-
 trunk/kernel/pid_namespace.c | 112 +-
 trunk/kernel/ptrace.c | 10 +-
 trunk/kernel/sched/core.c | 10 +-
 trunk/kernel/signal.c | 2 +-
 trunk/kernel/sysctl_binary.c | 2 +-
 trunk/kernel/user.c | 2 -
 trunk/kernel/user_namespace.c | 147 +-
 trunk/kernel/utsname.c | 33 +-
 trunk/lib/lru_cache.c | 359 +-
 trunk/mm/migrate.c | 2 +-
 trunk/net/core/net_namespace.c | 31 +-
 trunk/security/yama/yama_lsm.c | 12 +-
 113 files changed, 9166 insertions(+), 13992 deletions(-)
 delete mode 100644 trunk/drivers/block/drbd/drbd_interval.c
 delete mode 100644 trunk/drivers/block/drbd/drbd_interval.h
 delete mode 100644 trunk/drivers/block/drbd/drbd_nla.c
 delete mode 100644 trunk/drivers/block/drbd/drbd_nla.h
 delete mode 100644 trunk/drivers/block/drbd/drbd_state.c
 delete mode 100644 trunk/drivers/block/drbd/drbd_state.h
 delete mode 100644 trunk/fs/proc/self.c
 delete mode 100644 trunk/include/linux/drbd_genl.h
 delete mode 100644 trunk/include/linux/drbd_genl_api.h
 create mode 100644 trunk/include/linux/drbd_nl.h
 create mode 100644 trunk/include/linux/drbd_tag_magic.h
 delete mode 100644 trunk/include/linux/genl_magic_func.h
 delete mode 100644 trunk/include/linux/genl_magic_struct.h

diff --git a/[refs] b/[refs]
index 2595ffa3dd2c..8c6da8e09423 100644
--- a/[refs]
+++ b/[refs]
@@ -1,2 +1,2 @@
 ---
-refs/heads/master: 992956189de58cae9f2be40585bc25105cd7c5ad
+refs/heads/master: d7124073add4cd04508f9ae3adc2746c61d7e78b
diff --git a/trunk/arch/powerpc/platforms/cell/spufs/sched.c b/trunk/arch/powerpc/platforms/cell/spufs/sched.c
index 25db92a8e1cf..965d381abd75 100644
--- a/trunk/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/trunk/arch/powerpc/platforms/cell/spufs/sched.c
@@ -1094,7 +1094,7 @@ static int show_spu_loadavg(struct seq_file *s, void *private)
 		LOAD_INT(c), LOAD_FRAC(c),
 		count_active_contexts(),
 		atomic_read(&nr_spu_contexts),
-		task_active_pid_ns(current)->last_pid);
+		current->nsproxy->pid_ns->last_pid);
 
 	return 0;
 }
diff --git a/trunk/arch/um/drivers/mconsole_kern.c b/trunk/arch/um/drivers/mconsole_kern.c
index 4bd82ac0210f..49e3b49e552f 100644
--- a/trunk/arch/um/drivers/mconsole_kern.c
+++ b/trunk/arch/um/drivers/mconsole_kern.c
@@ -123,7 +123,7 @@ void mconsole_log(struct mc_request *req)
 
 void mconsole_proc(struct mc_request *req)
 {
-	struct vfsmount *mnt = task_active_pid_ns(current)->proc_mnt;
+	struct vfsmount *mnt = current->nsproxy->pid_ns->proc_mnt;
 	char *buf;
 	int len;
 	struct file *file;
diff --git a/trunk/arch/x86/include/uapi/asm/hw_breakpoint.h b/trunk/arch/x86/include/uapi/asm/hw_breakpoint.h
index e69de29bb2d1..79a9626b5500 100644
--- a/trunk/arch/x86/include/uapi/asm/hw_breakpoint.h
+++ b/trunk/arch/x86/include/uapi/asm/hw_breakpoint.h
@@ -0,0 +1 @@
+/* */
diff --git a/trunk/arch/x86/include/uapi/asm/setup.h b/trunk/arch/x86/include/uapi/asm/setup.h
index e69de29bb2d1..79a9626b5500 100644
--- a/trunk/arch/x86/include/uapi/asm/setup.h
+++ b/trunk/arch/x86/include/uapi/asm/setup.h
@@ -0,0 +1 @@
+/* */
diff --git a/trunk/block/genhd.c b/trunk/block/genhd.c
index 9a289d7c84bb..2a6fdf539a69 100644
---
a/trunk/block/genhd.c +++ b/trunk/block/genhd.c @@ -743,6 +743,7 @@ void __init printk_all_partitions(void) struct hd_struct *part; char name_buf[BDEVNAME_SIZE]; char devt_buf[BDEVT_SIZE]; + char uuid_buf[PARTITION_META_INFO_UUIDLTH * 2 + 5]; /* * Don't show empty devices or things that have been @@ -761,11 +762,16 @@ void __init printk_all_partitions(void) while ((part = disk_part_iter_next(&piter))) { bool is_part0 = part == &disk->part0; + uuid_buf[0] = '\0'; + if (part->info) + snprintf(uuid_buf, sizeof(uuid_buf), "%pU", + part->info->uuid); + printk("%s%s %10llu %s %s", is_part0 ? "" : " ", bdevt_str(part_devt(part), devt_buf), (unsigned long long)part_nr_sects_read(part) >> 1 , disk_name(disk, part->partno, name_buf), - part->info ? part->info->uuid : ""); + uuid_buf); if (is_part0) { if (disk->driverfs_dev != NULL && disk->driverfs_dev->driver != NULL) diff --git a/trunk/block/partitions/efi.c b/trunk/block/partitions/efi.c index b62fb88b8711..6296b403c67a 100644 --- a/trunk/block/partitions/efi.c +++ b/trunk/block/partitions/efi.c @@ -620,6 +620,7 @@ int efi_partition(struct parsed_partitions *state) gpt_entry *ptes = NULL; u32 i; unsigned ssz = bdev_logical_block_size(state->bdev) / 512; + u8 unparsed_guid[37]; if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) { kfree(gpt); @@ -648,7 +649,11 @@ int efi_partition(struct parsed_partitions *state) state->parts[i + 1].flags = ADDPART_FLAG_RAID; info = &state->parts[i + 1].info; - efi_guid_unparse(&ptes[i].unique_partition_guid, info->uuid); + /* Instead of doing a manual swap to big endian, reuse the + * common ASCII hex format as the interim. + */ + efi_guid_unparse(&ptes[i].unique_partition_guid, unparsed_guid); + part_pack_uuid(unparsed_guid, info->uuid); /* Naively convert UTF16-LE to 7 bits. */ label_max = min(sizeof(info->volname) - 1, diff --git a/trunk/block/partitions/msdos.c b/trunk/block/partitions/msdos.c index 8752a5d26565..5f79a6677c69 100644 --- a/trunk/block/partitions/msdos.c +++ b/trunk/block/partitions/msdos.c @@ -94,17 +94,6 @@ static int aix_magic_present(struct parsed_partitions *state, unsigned char *p) return ret; } -static void set_info(struct parsed_partitions *state, int slot, - u32 disksig) -{ - struct partition_meta_info *info = &state->parts[slot].info; - - snprintf(info->uuid, sizeof(info->uuid), "%08x-%02x", disksig, - slot); - info->volname[0] = 0; - state->parts[slot].has_info = true; -} - /* * Create devices for each logical partition in an extended partition. 
* The logical partitions form a linked list, with each entry being @@ -117,8 +106,7 @@ static void set_info(struct parsed_partitions *state, int slot, */ static void parse_extended(struct parsed_partitions *state, - sector_t first_sector, sector_t first_size, - u32 disksig) + sector_t first_sector, sector_t first_size) { struct partition *p; Sector sect; @@ -178,7 +166,6 @@ static void parse_extended(struct parsed_partitions *state, } put_partition(state, state->next, next, size); - set_info(state, state->next, disksig); if (SYS_IND(p) == LINUX_RAID_PARTITION) state->parts[state->next].flags = ADDPART_FLAG_RAID; loopct = 0; @@ -450,7 +437,6 @@ int msdos_partition(struct parsed_partitions *state) struct partition *p; struct fat_boot_sector *fb; int slot; - u32 disksig; data = read_part_sector(state, 0, §); if (!data) @@ -505,8 +491,6 @@ int msdos_partition(struct parsed_partitions *state) #endif p = (struct partition *) (data + 0x1be); - disksig = le32_to_cpup((__le32 *)(data + 0x1b8)); - /* * Look for partitions in two passes: * First find the primary and DOS-type extended partitions. @@ -531,12 +515,11 @@ int msdos_partition(struct parsed_partitions *state) put_partition(state, slot, start, n); strlcat(state->pp_buf, " <", PAGE_SIZE); - parse_extended(state, start, size, disksig); + parse_extended(state, start, size); strlcat(state->pp_buf, " >", PAGE_SIZE); continue; } put_partition(state, slot, start, size); - set_info(state, slot, disksig); if (SYS_IND(p) == LINUX_RAID_PARTITION) state->parts[slot].flags = ADDPART_FLAG_RAID; if (SYS_IND(p) == DM6_PARTITION) diff --git a/trunk/drivers/block/cciss.c b/trunk/drivers/block/cciss.c index 6526157edafc..ca83f96756ad 100644 --- a/trunk/drivers/block/cciss.c +++ b/trunk/drivers/block/cciss.c @@ -41,9 +41,8 @@ #include #include #include -#include -#include #include +#include #include #include @@ -979,7 +978,8 @@ static CommandList_struct *cmd_alloc(ctlr_info_t *h) i = find_first_zero_bit(h->cmd_pool_bits, h->nr_cmds); if (i == h->nr_cmds) return NULL; - } while (test_and_set_bit(i, h->cmd_pool_bits) != 0); + } while (test_and_set_bit(i & (BITS_PER_LONG - 1), + h->cmd_pool_bits + (i / BITS_PER_LONG)) != 0); c = h->cmd_pool + i; memset(c, 0, sizeof(CommandList_struct)); cmd_dma_handle = h->cmd_pool_dhandle + i * sizeof(CommandList_struct); @@ -1046,7 +1046,8 @@ static void cmd_free(ctlr_info_t *h, CommandList_struct *c) int i; i = c - h->cmd_pool; - clear_bit(i, h->cmd_pool_bits); + clear_bit(i & (BITS_PER_LONG - 1), + h->cmd_pool_bits + (i / BITS_PER_LONG)); h->nr_frees++; } @@ -4267,7 +4268,10 @@ static void __devinit cciss_find_board_params(ctlr_info_t *h) static inline bool CISS_signature_present(ctlr_info_t *h) { - if (!check_signature(h->cfgtable->Signature, "CISS", 4)) { + if ((readb(&h->cfgtable->Signature[0]) != 'C') || + (readb(&h->cfgtable->Signature[1]) != 'I') || + (readb(&h->cfgtable->Signature[2]) != 'S') || + (readb(&h->cfgtable->Signature[3]) != 'S')) { dev_warn(&h->pdev->dev, "not a valid CISS config table\n"); return false; } @@ -4808,7 +4812,8 @@ static __devinit int cciss_init_reset_devices(struct pci_dev *pdev) static __devinit int cciss_allocate_cmd_pool(ctlr_info_t *h) { - h->cmd_pool_bits = kmalloc(BITS_TO_LONGS(h->nr_cmds) * + h->cmd_pool_bits = kmalloc( + DIV_ROUND_UP(h->nr_cmds, BITS_PER_LONG) * sizeof(unsigned long), GFP_KERNEL); h->cmd_pool = pci_alloc_consistent(h->pdev, h->nr_cmds * sizeof(CommandList_struct), @@ -5063,7 +5068,9 @@ static int __devinit cciss_init_one(struct pci_dev *pdev, pci_set_drvdata(pdev, h); 
/* command and error info recs zeroed out before they are used */ - bitmap_zero(h->cmd_pool_bits, h->nr_cmds); + memset(h->cmd_pool_bits, 0, + DIV_ROUND_UP(h->nr_cmds, BITS_PER_LONG) + * sizeof(unsigned long)); h->num_luns = 0; h->highest_lun = -1; diff --git a/trunk/drivers/block/drbd/Kconfig b/trunk/drivers/block/drbd/Kconfig index 7845bd6ee414..df0983787390 100644 --- a/trunk/drivers/block/drbd/Kconfig +++ b/trunk/drivers/block/drbd/Kconfig @@ -2,14 +2,13 @@ # DRBD device driver configuration # -comment "DRBD disabled because PROC_FS or INET not selected" - depends on PROC_FS='n' || INET='n' +comment "DRBD disabled because PROC_FS, INET or CONNECTOR not selected" + depends on PROC_FS='n' || INET='n' || CONNECTOR='n' config BLK_DEV_DRBD tristate "DRBD Distributed Replicated Block Device support" - depends on PROC_FS && INET + depends on PROC_FS && INET && CONNECTOR select LRU_CACHE - select LIBCRC32C default n help @@ -59,8 +58,7 @@ config DRBD_FAULT_INJECTION 32 data read 64 read ahead 128 kmalloc of bitmap - 256 allocation of peer_requests - 512 insert data corruption on receiving side + 256 allocation of EE (epoch_entries) fault_devs: bitmask of minor numbers fault_rate: frequency in percent diff --git a/trunk/drivers/block/drbd/Makefile b/trunk/drivers/block/drbd/Makefile index 8b450338075e..0d3f337ff5ff 100644 --- a/trunk/drivers/block/drbd/Makefile +++ b/trunk/drivers/block/drbd/Makefile @@ -1,7 +1,5 @@ drbd-y := drbd_bitmap.o drbd_proc.o drbd-y += drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o drbd-y += drbd_main.o drbd_strings.o drbd_nl.o -drbd-y += drbd_interval.o drbd_state.o -drbd-y += drbd_nla.o obj-$(CONFIG_BLK_DEV_DRBD) += drbd.o diff --git a/trunk/drivers/block/drbd/drbd_actlog.c b/trunk/drivers/block/drbd/drbd_actlog.c index 92510f8ad013..3fbef018ce55 100644 --- a/trunk/drivers/block/drbd/drbd_actlog.c +++ b/trunk/drivers/block/drbd/drbd_actlog.c @@ -24,73 +24,21 @@ */ #include -#include #include -#include -#include #include "drbd_int.h" #include "drbd_wrappers.h" - -enum al_transaction_types { - AL_TR_UPDATE = 0, - AL_TR_INITIALIZED = 0xffff -}; -/* all fields on disc in big endian */ -struct __packed al_transaction_on_disk { - /* don't we all like magic */ - __be32 magic; - - /* to identify the most recent transaction block - * in the on disk ring buffer */ - __be32 tr_number; - - /* checksum on the full 4k block, with this field set to 0. */ - __be32 crc32c; - - /* type of transaction, special transaction types like: - * purge-all, set-all-idle, set-all-active, ... to-be-defined - * see also enum al_transaction_types */ - __be16 transaction_type; - - /* we currently allow only a few thousand extents, - * so 16bit will be enough for the slot number. */ - - /* how many updates in this transaction */ - __be16 n_updates; - - /* maximum slot number, "al-extents" in drbd.conf speak. - * Having this in each transaction should make reconfiguration - * of that parameter easier. */ - __be16 context_size; - - /* slot number the context starts with */ - __be16 context_start_slot_nr; - - /* Some reserved bytes. Expected usage is a 64bit counter of - * sectors-written since device creation, and other data generation tag - * supporting usage */ - __be32 __reserved[4]; - - /* --- 36 byte used --- */ - - /* Reserve space for up to AL_UPDATES_PER_TRANSACTION changes - * in one transaction, then use the remaining byte in the 4k block for - * context information. 
"Flexible" number of updates per transaction - * does not help, as we have to account for the case when all update - * slots are used anyways, so it would only complicate code without - * additional benefit. - */ - __be16 update_slot_nr[AL_UPDATES_PER_TRANSACTION]; - - /* but the extent number is 32bit, which at an extent size of 4 MiB - * allows to cover device sizes of up to 2**54 Byte (16 PiB) */ - __be32 update_extent_nr[AL_UPDATES_PER_TRANSACTION]; - - /* --- 420 bytes used (36 + 64*6) --- */ - - /* 4096 - 420 = 3676 = 919 * 4 */ - __be32 context[AL_CONTEXT_PER_TRANSACTION]; +/* We maintain a trivial checksum in our on disk activity log. + * With that we can ensure correct operation even when the storage + * device might do a partial (last) sector write while losing power. + */ +struct __packed al_transaction { + u32 magic; + u32 tr_number; + struct __packed { + u32 pos; + u32 extent; } updates[1 + AL_EXTENTS_PT]; + u32 xor_sum; }; struct update_odbm_work { @@ -100,11 +48,22 @@ struct update_odbm_work { struct update_al_work { struct drbd_work w; + struct lc_element *al_ext; struct completion event; - int err; + unsigned int enr; + /* if old_enr != LC_FREE, write corresponding bitmap sector, too */ + unsigned int old_enr; +}; + +struct drbd_atodb_wait { + atomic_t count; + struct completion io_done; + struct drbd_conf *mdev; + int error; }; -static int al_write_transaction(struct drbd_conf *mdev); + +int w_al_write_transaction(struct drbd_conf *, struct drbd_work *, int); void *drbd_md_get_buffer(struct drbd_conf *mdev) { @@ -123,24 +82,22 @@ void drbd_md_put_buffer(struct drbd_conf *mdev) wake_up(&mdev->misc_wait); } -void wait_until_done_or_force_detached(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, - unsigned int *done) +static bool md_io_allowed(struct drbd_conf *mdev) { - long dt; + enum drbd_disk_state ds = mdev->state.disk; + return ds >= D_NEGOTIATING || ds == D_ATTACHING; +} - rcu_read_lock(); - dt = rcu_dereference(bdev->disk_conf)->disk_timeout; - rcu_read_unlock(); - dt = dt * HZ / 10; +void wait_until_done_or_disk_failure(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, + unsigned int *done) +{ + long dt = bdev->dc.disk_timeout * HZ / 10; if (dt == 0) dt = MAX_SCHEDULE_TIMEOUT; - dt = wait_event_timeout(mdev->misc_wait, - *done || test_bit(FORCE_DETACH, &mdev->flags), dt); - if (dt == 0) { + dt = wait_event_timeout(mdev->misc_wait, *done || !md_io_allowed(mdev), dt); + if (dt == 0) dev_err(DEV, "meta-data IO operation timed out\n"); - drbd_chk_io_error(mdev, 1, DRBD_FORCE_DETACH); - } } static int _drbd_md_sync_page_io(struct drbd_conf *mdev, @@ -149,7 +106,7 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev, int rw, int size) { struct bio *bio; - int err; + int ok; mdev->md_io.done = 0; mdev->md_io.error = -ENODEV; @@ -161,8 +118,8 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev, bio = bio_alloc_drbd(GFP_NOIO); bio->bi_bdev = bdev->md_bdev; bio->bi_sector = sector; - err = -EIO; - if (bio_add_page(bio, page, size, 0) != size) + ok = (bio_add_page(bio, page, size, 0) == size); + if (!ok) goto out; bio->bi_private = &mdev->md_io; bio->bi_end_io = drbd_md_io_complete; @@ -170,7 +127,7 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev, if (!get_ldev_if_state(mdev, D_ATTACHING)) { /* Corresponding put_ldev in drbd_md_io_complete() */ dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in _drbd_md_sync_page_io()\n"); - err = -ENODEV; + ok = 0; goto out; } @@ -180,47 +137,86 @@ static int _drbd_md_sync_page_io(struct drbd_conf 
*mdev, bio_endio(bio, -EIO); else submit_bio(rw, bio); - wait_until_done_or_force_detached(mdev, bdev, &mdev->md_io.done); - if (bio_flagged(bio, BIO_UPTODATE)) - err = mdev->md_io.error; + wait_until_done_or_disk_failure(mdev, bdev, &mdev->md_io.done); + ok = bio_flagged(bio, BIO_UPTODATE) && mdev->md_io.error == 0; out: bio_put(bio); - return err; + return ok; } int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, sector_t sector, int rw) { - int err; + int logical_block_size, mask, ok; + int offset = 0; struct page *iop = mdev->md_io_page; D_ASSERT(atomic_read(&mdev->md_io_in_use) == 1); BUG_ON(!bdev->md_bdev); - dev_dbg(DEV, "meta_data io: %s [%d]:%s(,%llus,%s)\n", - current->comm, current->pid, __func__, - (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ"); + logical_block_size = bdev_logical_block_size(bdev->md_bdev); + if (logical_block_size == 0) + logical_block_size = MD_SECTOR_SIZE; + + /* in case logical_block_size != 512 [ s390 only? ] */ + if (logical_block_size != MD_SECTOR_SIZE) { + mask = (logical_block_size / MD_SECTOR_SIZE) - 1; + D_ASSERT(mask == 1 || mask == 3 || mask == 7); + D_ASSERT(logical_block_size == (mask+1) * MD_SECTOR_SIZE); + offset = sector & mask; + sector = sector & ~mask; + iop = mdev->md_io_tmpp; + + if (rw & WRITE) { + /* these are GFP_KERNEL pages, pre-allocated + * on device initialization */ + void *p = page_address(mdev->md_io_page); + void *hp = page_address(mdev->md_io_tmpp); + + ok = _drbd_md_sync_page_io(mdev, bdev, iop, sector, + READ, logical_block_size); + + if (unlikely(!ok)) { + dev_err(DEV, "drbd_md_sync_page_io(,%llus," + "READ [logical_block_size!=512]) failed!\n", + (unsigned long long)sector); + return 0; + } + + memcpy(hp + offset*MD_SECTOR_SIZE, p, MD_SECTOR_SIZE); + } + } if (sector < drbd_md_first_sector(bdev) || - sector + 7 > drbd_md_last_sector(bdev)) + sector > drbd_md_last_sector(bdev)) dev_alert(DEV, "%s [%d]:%s(,%llus,%s) out of range md access!\n", current->comm, current->pid, __func__, (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ"); - err = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, MD_BLOCK_SIZE); - if (err) { - dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed with error %d\n", - (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", err); + ok = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, logical_block_size); + if (unlikely(!ok)) { + dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed!\n", + (unsigned long long)sector, (rw & WRITE) ? 
"WRITE" : "READ"); + return 0; + } + + if (logical_block_size != MD_SECTOR_SIZE && !(rw & WRITE)) { + void *p = page_address(mdev->md_io_page); + void *hp = page_address(mdev->md_io_tmpp); + + memcpy(p, hp + offset*MD_SECTOR_SIZE, MD_SECTOR_SIZE); } - return err; + + return ok; } static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr) { struct lc_element *al_ext; struct lc_element *tmp; + unsigned long al_flags = 0; int wake; spin_lock_irq(&mdev->al_lock); @@ -235,92 +231,76 @@ static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr) return NULL; } } - al_ext = lc_get(mdev->act_log, enr); + al_ext = lc_get(mdev->act_log, enr); + al_flags = mdev->act_log->flags; spin_unlock_irq(&mdev->al_lock); + + /* + if (!al_ext) { + if (al_flags & LC_STARVING) + dev_warn(DEV, "Have to wait for LRU element (AL too small?)\n"); + if (al_flags & LC_DIRTY) + dev_warn(DEV, "Ongoing AL update (AL device too slow?)\n"); + } + */ + return al_ext; } -void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i) +void drbd_al_begin_io(struct drbd_conf *mdev, sector_t sector) { - /* for bios crossing activity log extent boundaries, - * we may need to activate two extents in one go */ - unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); - unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); - unsigned enr; - bool locked = false; - + unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9)); + struct lc_element *al_ext; + struct update_al_work al_work; - D_ASSERT(first <= last); D_ASSERT(atomic_read(&mdev->local_cnt) > 0); - for (enr = first; enr <= last; enr++) - wait_event(mdev->al_wait, _al_get(mdev, enr) != NULL); - - /* Serialize multiple transactions. - * This uses test_and_set_bit, memory barrier is implicit. - */ - wait_event(mdev->al_wait, - mdev->act_log->pending_changes == 0 || - (locked = lc_try_lock_for_transaction(mdev->act_log))); + wait_event(mdev->al_wait, (al_ext = _al_get(mdev, enr))); - if (locked) { + if (al_ext->lc_number != enr) { /* drbd_al_write_transaction(mdev,al_ext,enr); * recurses into generic_make_request(), which * disallows recursion, bios being serialized on the * current->bio_tail list now. * we have to delegate updates to the activity log * to the worker thread. */ - - /* Double check: it may have been committed by someone else, - * while we have been waiting for the lock. 
*/ - if (mdev->act_log->pending_changes) { - bool write_al_updates; - - rcu_read_lock(); - write_al_updates = rcu_dereference(mdev->ldev->disk_conf)->al_updates; - rcu_read_unlock(); - - if (write_al_updates) { - al_write_transaction(mdev); - mdev->al_writ_cnt++; - } - - spin_lock_irq(&mdev->al_lock); - /* FIXME - if (err) - we need an "lc_cancel" here; - */ - lc_committed(mdev->act_log); - spin_unlock_irq(&mdev->al_lock); - } - lc_unlock(mdev->act_log); + init_completion(&al_work.event); + al_work.al_ext = al_ext; + al_work.enr = enr; + al_work.old_enr = al_ext->lc_number; + al_work.w.cb = w_al_write_transaction; + drbd_queue_work_front(&mdev->data.work, &al_work.w); + wait_for_completion(&al_work.event); + + mdev->al_writ_cnt++; + + spin_lock_irq(&mdev->al_lock); + lc_changed(mdev->act_log, al_ext); + spin_unlock_irq(&mdev->al_lock); wake_up(&mdev->al_wait); } } -void drbd_al_complete_io(struct drbd_conf *mdev, struct drbd_interval *i) +void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector) { - /* for bios crossing activity log extent boundaries, - * we may need to activate two extents in one go */ - unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); - unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); - unsigned enr; + unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9)); struct lc_element *extent; unsigned long flags; - D_ASSERT(first <= last); spin_lock_irqsave(&mdev->al_lock, flags); - for (enr = first; enr <= last; enr++) { - extent = lc_find(mdev->act_log, enr); - if (!extent) { - dev_err(DEV, "al_complete_io() called on inactive extent %u\n", enr); - continue; - } - lc_put(mdev->act_log, extent); + extent = lc_find(mdev->act_log, enr); + + if (!extent) { + spin_unlock_irqrestore(&mdev->al_lock, flags); + dev_err(DEV, "al_complete_io() called on inactive extent %u\n", enr); + return; } + + if (lc_put(mdev->act_log, extent) == 0) + wake_up(&mdev->al_wait); + spin_unlock_irqrestore(&mdev->al_lock, flags); - wake_up(&mdev->al_wait); } #if (PAGE_SHIFT + 3) < (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT) @@ -346,148 +326,296 @@ static unsigned int rs_extent_to_bm_page(unsigned int rs_enr) return rs_enr >> /* bit to page */ ((PAGE_SHIFT + 3) - - /* resync extent number to bit */ + /* al extent number to bit */ (BM_EXT_SHIFT - BM_BLOCK_SHIFT)); } -static int -_al_write_transaction(struct drbd_conf *mdev) +int +w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused) { - struct al_transaction_on_disk *buffer; - struct lc_element *e; + struct update_al_work *aw = container_of(w, struct update_al_work, w); + struct lc_element *updated = aw->al_ext; + const unsigned int new_enr = aw->enr; + const unsigned int evicted = aw->old_enr; + struct al_transaction *buffer; sector_t sector; - int i, mx; - unsigned extent_nr; - unsigned crc = 0; - int err = 0; + int i, n, mx; + unsigned int extent_nr; + u32 xor_sum = 0; if (!get_ldev(mdev)) { - dev_err(DEV, "disk is %s, cannot start al transaction\n", - drbd_disk_str(mdev->state.disk)); - return -EIO; + dev_err(DEV, + "disk is %s, cannot start al transaction (-%d +%d)\n", + drbd_disk_str(mdev->state.disk), evicted, new_enr); + complete(&((struct update_al_work *)w)->event); + return 1; } + /* do we have to do a bitmap write, first? + * TODO reduce maximum latency: + * submit both bios, then wait for both, + * instead of doing two synchronous sector writes. + * For now, we must not write the transaction, + * if we cannot write out the bitmap of the evicted extent. 
*/ + if (mdev->state.conn < C_CONNECTED && evicted != LC_FREE) + drbd_bm_write_page(mdev, al_extent_to_bm_page(evicted)); /* The bitmap write may have failed, causing a state change. */ if (mdev->state.disk < D_INCONSISTENT) { dev_err(DEV, - "disk is %s, cannot write al transaction\n", - drbd_disk_str(mdev->state.disk)); + "disk is %s, cannot write al transaction (-%d +%d)\n", + drbd_disk_str(mdev->state.disk), evicted, new_enr); + complete(&((struct update_al_work *)w)->event); put_ldev(mdev); - return -EIO; + return 1; } buffer = drbd_md_get_buffer(mdev); /* protects md_io_buffer, al_tr_cycle, ... */ if (!buffer) { dev_err(DEV, "disk failed while waiting for md_io buffer\n"); + complete(&((struct update_al_work *)w)->event); put_ldev(mdev); - return -ENODEV; + return 1; } - memset(buffer, 0, sizeof(*buffer)); - buffer->magic = cpu_to_be32(DRBD_AL_MAGIC); + buffer->magic = __constant_cpu_to_be32(DRBD_MAGIC); buffer->tr_number = cpu_to_be32(mdev->al_tr_number); - i = 0; - - /* Even though no one can start to change this list - * once we set the LC_LOCKED -- from drbd_al_begin_io(), - * lc_try_lock_for_transaction() --, someone may still - * be in the process of changing it. */ - spin_lock_irq(&mdev->al_lock); - list_for_each_entry(e, &mdev->act_log->to_be_changed, list) { - if (i == AL_UPDATES_PER_TRANSACTION) { - i++; - break; - } - buffer->update_slot_nr[i] = cpu_to_be16(e->lc_index); - buffer->update_extent_nr[i] = cpu_to_be32(e->lc_new_number); - if (e->lc_number != LC_FREE) - drbd_bm_mark_for_writeout(mdev, - al_extent_to_bm_page(e->lc_number)); - i++; - } - spin_unlock_irq(&mdev->al_lock); - BUG_ON(i > AL_UPDATES_PER_TRANSACTION); + n = lc_index_of(mdev->act_log, updated); - buffer->n_updates = cpu_to_be16(i); - for ( ; i < AL_UPDATES_PER_TRANSACTION; i++) { - buffer->update_slot_nr[i] = cpu_to_be16(-1); - buffer->update_extent_nr[i] = cpu_to_be32(LC_FREE); - } + buffer->updates[0].pos = cpu_to_be32(n); + buffer->updates[0].extent = cpu_to_be32(new_enr); - buffer->context_size = cpu_to_be16(mdev->act_log->nr_elements); - buffer->context_start_slot_nr = cpu_to_be16(mdev->al_tr_cycle); + xor_sum ^= new_enr; - mx = min_t(int, AL_CONTEXT_PER_TRANSACTION, + mx = min_t(int, AL_EXTENTS_PT, mdev->act_log->nr_elements - mdev->al_tr_cycle); for (i = 0; i < mx; i++) { unsigned idx = mdev->al_tr_cycle + i; extent_nr = lc_element_by_index(mdev->act_log, idx)->lc_number; - buffer->context[i] = cpu_to_be32(extent_nr); + buffer->updates[i+1].pos = cpu_to_be32(idx); + buffer->updates[i+1].extent = cpu_to_be32(extent_nr); + xor_sum ^= extent_nr; } - for (; i < AL_CONTEXT_PER_TRANSACTION; i++) - buffer->context[i] = cpu_to_be32(LC_FREE); - - mdev->al_tr_cycle += AL_CONTEXT_PER_TRANSACTION; + for (; i < AL_EXTENTS_PT; i++) { + buffer->updates[i+1].pos = __constant_cpu_to_be32(-1); + buffer->updates[i+1].extent = __constant_cpu_to_be32(LC_FREE); + xor_sum ^= LC_FREE; + } + mdev->al_tr_cycle += AL_EXTENTS_PT; if (mdev->al_tr_cycle >= mdev->act_log->nr_elements) mdev->al_tr_cycle = 0; - sector = mdev->ldev->md.md_offset - + mdev->ldev->md.al_offset - + mdev->al_tr_pos * (MD_BLOCK_SIZE>>9); + buffer->xor_sum = cpu_to_be32(xor_sum); - crc = crc32c(0, buffer, 4096); - buffer->crc32c = cpu_to_be32(crc); + sector = mdev->ldev->md.md_offset + + mdev->ldev->md.al_offset + mdev->al_tr_pos; - if (drbd_bm_write_hinted(mdev)) - err = -EIO; - /* drbd_chk_io_error done already */ - else if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) { - err = -EIO; + if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, 
WRITE)) drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR); - } else { - /* advance ringbuffer position and transaction counter */ - mdev->al_tr_pos = (mdev->al_tr_pos + 1) % (MD_AL_SECTORS*512/MD_BLOCK_SIZE); - mdev->al_tr_number++; - } + + if (++mdev->al_tr_pos > + div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT)) + mdev->al_tr_pos = 0; + + D_ASSERT(mdev->al_tr_pos < MD_AL_MAX_SIZE); + mdev->al_tr_number++; drbd_md_put_buffer(mdev); + + complete(&((struct update_al_work *)w)->event); put_ldev(mdev); - return err; + return 1; } +/** + * drbd_al_read_tr() - Read a single transaction from the on disk activity log + * @mdev: DRBD device. + * @bdev: Block device to read form. + * @b: pointer to an al_transaction. + * @index: On disk slot of the transaction to read. + * + * Returns -1 on IO error, 0 on checksum error and 1 upon success. + */ +static int drbd_al_read_tr(struct drbd_conf *mdev, + struct drbd_backing_dev *bdev, + struct al_transaction *b, + int index) +{ + sector_t sector; + int rv, i; + u32 xor_sum = 0; + + sector = bdev->md.md_offset + bdev->md.al_offset + index; + + /* Dont process error normally, + * as this is done before disk is attached! */ + if (!drbd_md_sync_page_io(mdev, bdev, sector, READ)) + return -1; + + rv = (be32_to_cpu(b->magic) == DRBD_MAGIC); + + for (i = 0; i < AL_EXTENTS_PT + 1; i++) + xor_sum ^= be32_to_cpu(b->updates[i].extent); + rv &= (xor_sum == be32_to_cpu(b->xor_sum)); -static int w_al_write_transaction(struct drbd_work *w, int unused) + return rv; +} + +/** + * drbd_al_read_log() - Restores the activity log from its on disk representation. + * @mdev: DRBD device. + * @bdev: Block device to read form. + * + * Returns 1 on success, returns 0 when reading the log failed due to IO errors. + */ +int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) { - struct update_al_work *aw = container_of(w, struct update_al_work, w); - struct drbd_conf *mdev = w->mdev; - int err; + struct al_transaction *buffer; + int i; + int rv; + int mx; + int active_extents = 0; + int transactions = 0; + int found_valid = 0; + int from = 0; + int to = 0; + u32 from_tnr = 0; + u32 to_tnr = 0; + u32 cnr; + + mx = div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT); + + /* lock out all other meta data io for now, + * and make sure the page is mapped. + */ + buffer = drbd_md_get_buffer(mdev); + if (!buffer) + return 0; + + /* Find the valid transaction in the log */ + for (i = 0; i <= mx; i++) { + rv = drbd_al_read_tr(mdev, bdev, buffer, i); + if (rv == 0) + continue; + if (rv == -1) { + drbd_md_put_buffer(mdev); + return 0; + } + cnr = be32_to_cpu(buffer->tr_number); + + if (++found_valid == 1) { + from = i; + to = i; + from_tnr = cnr; + to_tnr = cnr; + continue; + } + if ((int)cnr - (int)from_tnr < 0) { + D_ASSERT(from_tnr - cnr + i - from == mx+1); + from = i; + from_tnr = cnr; + } + if ((int)cnr - (int)to_tnr > 0) { + D_ASSERT(cnr - to_tnr == i - to); + to = i; + to_tnr = cnr; + } + } + + if (!found_valid) { + dev_warn(DEV, "No usable activity log found.\n"); + drbd_md_put_buffer(mdev); + return 1; + } + + /* Read the valid transactions. 
+ * dev_info(DEV, "Reading from %d to %d.\n",from,to); */ + i = from; + while (1) { + int j, pos; + unsigned int extent_nr; + unsigned int trn; + + rv = drbd_al_read_tr(mdev, bdev, buffer, i); + ERR_IF(rv == 0) goto cancel; + if (rv == -1) { + drbd_md_put_buffer(mdev); + return 0; + } + + trn = be32_to_cpu(buffer->tr_number); + + spin_lock_irq(&mdev->al_lock); + + /* This loop runs backwards because in the cyclic + elements there might be an old version of the + updated element (in slot 0). So the element in slot 0 + can overwrite old versions. */ + for (j = AL_EXTENTS_PT; j >= 0; j--) { + pos = be32_to_cpu(buffer->updates[j].pos); + extent_nr = be32_to_cpu(buffer->updates[j].extent); + + if (extent_nr == LC_FREE) + continue; + + lc_set(mdev->act_log, extent_nr, pos); + active_extents++; + } + spin_unlock_irq(&mdev->al_lock); + + transactions++; + +cancel: + if (i == to) + break; + i++; + if (i > mx) + i = 0; + } + + mdev->al_tr_number = to_tnr+1; + mdev->al_tr_pos = to; + if (++mdev->al_tr_pos > + div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT)) + mdev->al_tr_pos = 0; + + /* ok, we are done with it */ + drbd_md_put_buffer(mdev); - err = _al_write_transaction(mdev); - aw->err = err; - complete(&aw->event); + dev_info(DEV, "Found %d transactions (%d active extents) in activity log.\n", + transactions, active_extents); - return err != -EIO ? err : 0; + return 1; } -/* Calls from worker context (see w_restart_disk_io()) need to write the - transaction directly. Others came through generic_make_request(), - those need to delegate it to the worker. */ -static int al_write_transaction(struct drbd_conf *mdev) +/** + * drbd_al_apply_to_bm() - Sets the bitmap to diry(1) where covered ba active AL extents + * @mdev: DRBD device. + */ +void drbd_al_apply_to_bm(struct drbd_conf *mdev) { - struct update_al_work al_work; + unsigned int enr; + unsigned long add = 0; + char ppb[10]; + int i, tmp; + + wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); - if (current == mdev->tconn->worker.task) - return _al_write_transaction(mdev); + for (i = 0; i < mdev->act_log->nr_elements; i++) { + enr = lc_element_by_index(mdev->act_log, i)->lc_number; + if (enr == LC_FREE) + continue; + tmp = drbd_bm_ALe_set_all(mdev, enr); + dynamic_dev_dbg(DEV, "AL: set %d bits in extent %u\n", tmp, enr); + add += tmp; + } - init_completion(&al_work.event); - al_work.w.cb = w_al_write_transaction; - al_work.w.mdev = mdev; - drbd_queue_work_front(&mdev->tconn->sender_work, &al_work.w); - wait_for_completion(&al_work.event); + lc_unlock(mdev->act_log); + wake_up(&mdev->al_wait); - return al_work.err; + dev_info(DEV, "Marked additional %s as out-of-sync based on AL.\n", + ppsize(ppb, Bit2KB(add))); } static int _try_lc_del(struct drbd_conf *mdev, struct lc_element *al_ext) @@ -517,7 +645,7 @@ void drbd_al_shrink(struct drbd_conf *mdev) struct lc_element *al_ext; int i; - D_ASSERT(test_bit(__LC_LOCKED, &mdev->act_log->flags)); + D_ASSERT(test_bit(__LC_DIRTY, &mdev->act_log->flags)); for (i = 0; i < mdev->act_log->nr_elements; i++) { al_ext = lc_element_by_index(mdev->act_log, i); @@ -529,17 +657,15 @@ void drbd_al_shrink(struct drbd_conf *mdev) wake_up(&mdev->al_wait); } -static int w_update_odbm(struct drbd_work *w, int unused) +static int w_update_odbm(struct drbd_conf *mdev, struct drbd_work *w, int unused) { struct update_odbm_work *udw = container_of(w, struct update_odbm_work, w); - struct drbd_conf *mdev = w->mdev; - struct sib_info sib = { .sib_reason = SIB_SYNC_PROGRESS, }; if (!get_ldev(mdev)) { if 
(__ratelimit(&drbd_ratelimit_state)) dev_warn(DEV, "Can not update on disk bitmap, local IO disabled.\n"); kfree(udw); - return 0; + return 1; } drbd_bm_write_page(mdev, rs_extent_to_bm_page(udw->enr)); @@ -557,9 +683,9 @@ static int w_update_odbm(struct drbd_work *w, int unused) break; } } - drbd_bcast_event(mdev, &sib); + drbd_bcast_sync_progress(mdev); - return 0; + return 1; } @@ -629,9 +755,7 @@ static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector, } ext->rs_left = rs_left; ext->rs_failed = success ? 0 : count; - /* we don't keep a persistent log of the resync lru, - * we can commit any change right away. */ - lc_committed(mdev->resync); + lc_changed(mdev->resync, &ext->lce); } lc_put(mdev->resync, &ext->lce); /* no race, we are within the al_lock! */ @@ -643,8 +767,7 @@ static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector, if (udw) { udw->enr = ext->lce.lc_number; udw->w.cb = w_update_odbm; - udw->w.mdev = mdev; - drbd_queue_work_front(&mdev->tconn->sender_work, &udw->w); + drbd_queue_work_front(&mdev->data.work, &udw->w); } else { dev_warn(DEV, "Could not kmalloc an udw\n"); } @@ -690,22 +813,16 @@ void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size, int wake_up = 0; unsigned long flags; - if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) { + if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) { dev_err(DEV, "drbd_set_in_sync: sector=%llus size=%d nonsense!\n", (unsigned long long)sector, size); return; } - - if (!get_ldev(mdev)) - return; /* no disk, no metadata, no bitmap to clear bits in */ - nr_sectors = drbd_get_capacity(mdev->this_bdev); esector = sector + (size >> 9) - 1; - if (!expect(sector < nr_sectors)) - goto out; - if (!expect(esector < nr_sectors)) - esector = nr_sectors - 1; + ERR_IF(sector >= nr_sectors) return; + ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1); lbnr = BM_SECT_TO_BIT(nr_sectors-1); @@ -713,7 +830,7 @@ void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size, * round up start sector, round down end sector. we make sure we only * clear full, aligned, BM_BLOCK_SIZE (4K) blocks */ if (unlikely(esector < BM_SECT_PER_BIT-1)) - goto out; + return; if (unlikely(esector == (nr_sectors-1))) ebnr = lbnr; else @@ -721,14 +838,14 @@ void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size, sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1); if (sbnr > ebnr) - goto out; + return; /* * ok, (capacity & 7) != 0 sometimes, but who cares... * we count rs_{total,left} in bits, not sectors. */ count = drbd_bm_clear_bits(mdev, sbnr, ebnr); - if (count) { + if (count && get_ldev(mdev)) { drbd_advance_rs_marks(mdev, drbd_bm_total_weight(mdev)); spin_lock_irqsave(&mdev->al_lock, flags); drbd_try_clear_on_disk_bm(mdev, sector, count, true); @@ -737,9 +854,8 @@ void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size, /* just wake_up unconditional now, various lc_chaged(), * lc_put() in drbd_try_clear_on_disk_bm(). 
*/ wake_up = 1; + put_ldev(mdev); } -out: - put_ldev(mdev); if (wake_up) wake_up(&mdev->al_wait); } @@ -755,7 +871,7 @@ void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size, int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size, const char *file, const unsigned int line) { - unsigned long sbnr, ebnr, flags; + unsigned long sbnr, ebnr, lbnr, flags; sector_t esector, nr_sectors; unsigned int enr, count = 0; struct lc_element *e; @@ -764,7 +880,7 @@ int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size, if (size == 0) return 0; - if (size < 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) { + if (size < 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) { dev_err(DEV, "sector: %llus, size: %d\n", (unsigned long long)sector, size); return 0; @@ -776,10 +892,12 @@ int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size, nr_sectors = drbd_get_capacity(mdev->this_bdev); esector = sector + (size >> 9) - 1; - if (!expect(sector < nr_sectors)) + ERR_IF(sector >= nr_sectors) goto out; - if (!expect(esector < nr_sectors)) - esector = nr_sectors - 1; + ERR_IF(esector >= nr_sectors) + esector = (nr_sectors-1); + + lbnr = BM_SECT_TO_BIT(nr_sectors-1); /* we set it out of sync, * we do not need to round anything here */ @@ -822,7 +940,7 @@ struct bm_extent *_bme_get(struct drbd_conf *mdev, unsigned int enr) if (bm_ext->lce.lc_number != enr) { bm_ext->rs_left = drbd_bm_e_weight(mdev, enr); bm_ext->rs_failed = 0; - lc_committed(mdev->resync); + lc_changed(mdev->resync, &bm_ext->lce); wakeup = 1; } if (bm_ext->lce.refcnt == 1) @@ -838,7 +956,7 @@ struct bm_extent *_bme_get(struct drbd_conf *mdev, unsigned int enr) if (rs_flags & LC_STARVING) dev_warn(DEV, "Have to wait for element" " (resync LRU too small?)\n"); - BUG_ON(rs_flags & LC_LOCKED); + BUG_ON(rs_flags & LC_DIRTY); } return bm_ext; @@ -846,12 +964,26 @@ struct bm_extent *_bme_get(struct drbd_conf *mdev, unsigned int enr) static int _is_in_al(struct drbd_conf *mdev, unsigned int enr) { - int rv; + struct lc_element *al_ext; + int rv = 0; spin_lock_irq(&mdev->al_lock); - rv = lc_is_used(mdev->act_log, enr); + if (unlikely(enr == mdev->act_log->new_number)) + rv = 1; + else { + al_ext = lc_find(mdev->act_log, enr); + if (al_ext) { + if (al_ext->refcnt) + rv = 1; + } + } spin_unlock_irq(&mdev->al_lock); + /* + if (unlikely(rv)) { + dev_info(DEV, "Delaying sync read until app's write is done\n"); + } + */ return rv; } @@ -981,13 +1113,13 @@ int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector) if (rs_flags & LC_STARVING) dev_warn(DEV, "Have to wait for element" " (resync LRU too small?)\n"); - BUG_ON(rs_flags & LC_LOCKED); + BUG_ON(rs_flags & LC_DIRTY); goto try_again; } if (bm_ext->lce.lc_number != enr) { bm_ext->rs_left = drbd_bm_e_weight(mdev, enr); bm_ext->rs_failed = 0; - lc_committed(mdev->resync); + lc_changed(mdev->resync, &bm_ext->lce); wake_up(&mdev->al_wait); D_ASSERT(test_bit(BME_LOCKED, &bm_ext->flags) == 0); } @@ -998,6 +1130,8 @@ int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector) } check_al: for (i = 0; i < AL_EXT_PER_BM_SECT; i++) { + if (unlikely(al_enr+i == mdev->act_log->new_number)) + goto try_again; if (lc_is_used(mdev->act_log, al_enr+i)) goto try_again; } @@ -1132,7 +1266,7 @@ void drbd_rs_failed_io(struct drbd_conf *mdev, sector_t sector, int size) sector_t esector, nr_sectors; int wake_up = 0; - if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) { + if (size <= 0 || (size & 0x1ff) != 0 || 
size > DRBD_MAX_BIO_SIZE) { dev_err(DEV, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n", (unsigned long long)sector, size); return; @@ -1140,10 +1274,8 @@ void drbd_rs_failed_io(struct drbd_conf *mdev, sector_t sector, int size) nr_sectors = drbd_get_capacity(mdev->this_bdev); esector = sector + (size >> 9) - 1; - if (!expect(sector < nr_sectors)) - return; - if (!expect(esector < nr_sectors)) - esector = nr_sectors - 1; + ERR_IF(sector >= nr_sectors) return; + ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1); lbnr = BM_SECT_TO_BIT(nr_sectors-1); diff --git a/trunk/drivers/block/drbd/drbd_bitmap.c b/trunk/drivers/block/drbd/drbd_bitmap.c index 8dc29502dc08..d84566496746 100644 --- a/trunk/drivers/block/drbd/drbd_bitmap.c +++ b/trunk/drivers/block/drbd/drbd_bitmap.c @@ -119,9 +119,13 @@ static void __bm_print_lock_info(struct drbd_conf *mdev, const char *func) if (!__ratelimit(&drbd_ratelimit_state)) return; dev_err(DEV, "FIXME %s in %s, bitmap locked for '%s' by %s\n", - drbd_task_to_thread_name(mdev->tconn, current), - func, b->bm_why ?: "?", - drbd_task_to_thread_name(mdev->tconn, b->bm_task)); + current == mdev->receiver.task ? "receiver" : + current == mdev->asender.task ? "asender" : + current == mdev->worker.task ? "worker" : current->comm, + func, b->bm_why ?: "?", + b->bm_task == mdev->receiver.task ? "receiver" : + b->bm_task == mdev->asender.task ? "asender" : + b->bm_task == mdev->worker.task ? "worker" : "?"); } void drbd_bm_lock(struct drbd_conf *mdev, char *why, enum bm_flag flags) @@ -138,9 +142,13 @@ void drbd_bm_lock(struct drbd_conf *mdev, char *why, enum bm_flag flags) if (trylock_failed) { dev_warn(DEV, "%s going to '%s' but bitmap already locked for '%s' by %s\n", - drbd_task_to_thread_name(mdev->tconn, current), - why, b->bm_why ?: "?", - drbd_task_to_thread_name(mdev->tconn, b->bm_task)); + current == mdev->receiver.task ? "receiver" : + current == mdev->asender.task ? "asender" : + current == mdev->worker.task ? "worker" : current->comm, + why, b->bm_why ?: "?", + b->bm_task == mdev->receiver.task ? "receiver" : + b->bm_task == mdev->asender.task ? "asender" : + b->bm_task == mdev->worker.task ? "worker" : "?"); mutex_lock(&b->bm_change); } if (BM_LOCKED_MASK & b->bm_flags) @@ -188,9 +196,6 @@ void drbd_bm_unlock(struct drbd_conf *mdev) /* to mark for lazy writeout once syncer cleared all clearable bits, * we if bits have been cleared since last IO. */ #define BM_PAGE_LAZY_WRITEOUT 28 -/* pages marked with this "HINT" will be considered for writeout - * on activity log transactions */ -#define BM_PAGE_HINT_WRITEOUT 27 /* store_page_idx uses non-atomic assignment. It is only used directly after * allocating the page. All other bm_set_page_* and bm_clear_page_* need to @@ -222,7 +227,8 @@ static void bm_page_unlock_io(struct drbd_conf *mdev, int page_nr) { struct drbd_bitmap *b = mdev->bitmap; void *addr = &page_private(b->bm_pages[page_nr]); - clear_bit_unlock(BM_PAGE_IO_LOCK, addr); + clear_bit(BM_PAGE_IO_LOCK, addr); + smp_mb__after_clear_bit(); wake_up(&mdev->bitmap->bm_io_wait); } @@ -240,27 +246,6 @@ static void bm_set_page_need_writeout(struct page *page) set_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page)); } -/** - * drbd_bm_mark_for_writeout() - mark a page with a "hint" to be considered for writeout - * @mdev: DRBD device. 
- * @page_nr: the bitmap page to mark with the "hint" flag - * - * From within an activity log transaction, we mark a few pages with these - * hints, then call drbd_bm_write_hinted(), which will only write out changed - * pages which are flagged with this mark. - */ -void drbd_bm_mark_for_writeout(struct drbd_conf *mdev, int page_nr) -{ - struct page *page; - if (page_nr >= mdev->bitmap->bm_number_of_pages) { - dev_warn(DEV, "BAD: page_nr: %u, number_of_pages: %u\n", - page_nr, (int)mdev->bitmap->bm_number_of_pages); - return; - } - page = mdev->bitmap->bm_pages[page_nr]; - set_bit(BM_PAGE_HINT_WRITEOUT, &page_private(page)); -} - static int bm_test_page_unchanged(struct page *page) { volatile const unsigned long *addr = &page_private(page); @@ -388,16 +373,14 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want) return old_pages; /* Trying kmalloc first, falling back to vmalloc. - * GFP_NOIO, as this is called while drbd IO is "suspended", - * and during resize or attach on diskless Primary, - * we must not block on IO to ourselves. - * Context is receiver thread or dmsetup. */ + * GFP_KERNEL is ok, as this is done when a lower level disk is + * "attached" to the drbd. Context is receiver thread or cqueue + * thread. As we have no disk yet, we are not in the IO path, + * not even the IO path of the peer. */ bytes = sizeof(struct page *)*want; - new_pages = kzalloc(bytes, GFP_NOIO); + new_pages = kzalloc(bytes, GFP_KERNEL); if (!new_pages) { - new_pages = __vmalloc(bytes, - GFP_NOIO | __GFP_HIGHMEM | __GFP_ZERO, - PAGE_KERNEL); + new_pages = vzalloc(bytes); if (!new_pages) return NULL; vmalloced = 1; @@ -407,7 +390,7 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want) for (i = 0; i < have; i++) new_pages[i] = old_pages[i]; for (; i < want; i++) { - page = alloc_page(GFP_NOIO | __GFP_HIGHMEM); + page = alloc_page(GFP_HIGHUSER); if (!page) { bm_free_pages(new_pages + have, i - have); bm_vk_free(new_pages, vmalloced); @@ -456,8 +439,7 @@ int drbd_bm_init(struct drbd_conf *mdev) sector_t drbd_bm_capacity(struct drbd_conf *mdev) { - if (!expect(mdev->bitmap)) - return 0; + ERR_IF(!mdev->bitmap) return 0; return mdev->bitmap->bm_dev_capacity; } @@ -465,8 +447,7 @@ sector_t drbd_bm_capacity(struct drbd_conf *mdev) */ void drbd_bm_cleanup(struct drbd_conf *mdev) { - if (!expect(mdev->bitmap)) - return; + ERR_IF (!mdev->bitmap) return; bm_free_pages(mdev->bitmap->bm_pages, mdev->bitmap->bm_number_of_pages); bm_vk_free(mdev->bitmap->bm_pages, (BM_P_VMALLOCED & mdev->bitmap->bm_flags)); kfree(mdev->bitmap); @@ -629,8 +610,7 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits) int err = 0, growing; int opages_vmalloced; - if (!expect(b)) - return -ENOMEM; + ERR_IF(!b) return -ENOMEM; drbd_bm_lock(mdev, "resize", BM_LOCKED_MASK); @@ -752,10 +732,8 @@ unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev) unsigned long s; unsigned long flags; - if (!expect(b)) - return 0; - if (!expect(b->bm_pages)) - return 0; + ERR_IF(!b) return 0; + ERR_IF(!b->bm_pages) return 0; spin_lock_irqsave(&b->bm_lock, flags); s = b->bm_set; @@ -778,10 +756,8 @@ unsigned long drbd_bm_total_weight(struct drbd_conf *mdev) size_t drbd_bm_words(struct drbd_conf *mdev) { struct drbd_bitmap *b = mdev->bitmap; - if (!expect(b)) - return 0; - if (!expect(b->bm_pages)) - return 0; + ERR_IF(!b) return 0; + ERR_IF(!b->bm_pages) return 0; return b->bm_words; } @@ -789,8 +765,7 @@ size_t drbd_bm_words(struct drbd_conf *mdev) unsigned long 
drbd_bm_bits(struct drbd_conf *mdev) { struct drbd_bitmap *b = mdev->bitmap; - if (!expect(b)) - return 0; + ERR_IF(!b) return 0; return b->bm_bits; } @@ -811,10 +786,8 @@ void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset, size_t number, end = offset + number; - if (!expect(b)) - return; - if (!expect(b->bm_pages)) - return; + ERR_IF(!b) return; + ERR_IF(!b->bm_pages) return; if (number == 0) return; WARN_ON(offset >= b->bm_words); @@ -858,10 +831,8 @@ void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset, size_t number, end = offset + number; - if (!expect(b)) - return; - if (!expect(b->bm_pages)) - return; + ERR_IF(!b) return; + ERR_IF(!b->bm_pages) return; spin_lock_irq(&b->bm_lock); if ((offset >= b->bm_words) || @@ -889,10 +860,8 @@ void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset, size_t number, void drbd_bm_set_all(struct drbd_conf *mdev) { struct drbd_bitmap *b = mdev->bitmap; - if (!expect(b)) - return; - if (!expect(b->bm_pages)) - return; + ERR_IF(!b) return; + ERR_IF(!b->bm_pages) return; spin_lock_irq(&b->bm_lock); bm_memset(b, 0, 0xff, b->bm_words); @@ -905,10 +874,8 @@ void drbd_bm_set_all(struct drbd_conf *mdev) void drbd_bm_clear_all(struct drbd_conf *mdev) { struct drbd_bitmap *b = mdev->bitmap; - if (!expect(b)) - return; - if (!expect(b->bm_pages)) - return; + ERR_IF(!b) return; + ERR_IF(!b->bm_pages) return; spin_lock_irq(&b->bm_lock); bm_memset(b, 0, 0, b->bm_words); @@ -922,8 +889,7 @@ struct bm_aio_ctx { unsigned int done; unsigned flags; #define BM_AIO_COPY_PAGES 1 -#define BM_AIO_WRITE_HINTED 2 -#define BM_WRITE_ALL_PAGES 4 +#define BM_WRITE_ALL_PAGES 2 int error; struct kref kref; }; @@ -1011,11 +977,17 @@ static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must bm_set_page_unchanged(b->bm_pages[page_nr]); if (ctx->flags & BM_AIO_COPY_PAGES) { + void *src, *dest; page = mempool_alloc(drbd_md_io_page_pool, __GFP_HIGHMEM|__GFP_WAIT); - copy_highpage(page, b->bm_pages[page_nr]); + dest = kmap_atomic(page); + src = kmap_atomic(b->bm_pages[page_nr]); + memcpy(dest, src, PAGE_SIZE); + kunmap_atomic(src); + kunmap_atomic(dest); bm_store_page_idx(page, page_nr); } else page = b->bm_pages[page_nr]; + bio->bi_bdev = mdev->ldev->md_bdev; bio->bi_sector = on_disk_sector; /* bio_add_page of a single page to an empty bio will always succeed, @@ -1088,11 +1060,6 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned flags, unsigned lazy_w if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx) break; if (rw & WRITE) { - if ((flags & BM_AIO_WRITE_HINTED) && - !test_and_clear_bit(BM_PAGE_HINT_WRITEOUT, - &page_private(b->bm_pages[i]))) - continue; - if (!(flags & BM_WRITE_ALL_PAGES) && bm_test_page_unchanged(b->bm_pages[i])) { dynamic_dev_dbg(DEV, "skipped bm write for idx %u\n", i); @@ -1121,15 +1088,13 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned flags, unsigned lazy_w * "in_flight reached zero, all done" event. */ if (!atomic_dec_and_test(&ctx->in_flight)) - wait_until_done_or_force_detached(mdev, mdev->ldev, &ctx->done); + wait_until_done_or_disk_failure(mdev, mdev->ldev, &ctx->done); else kref_put(&ctx->kref, &bm_aio_ctx_destroy); - /* summary for global bitmap IO */ - if (flags == 0) - dev_info(DEV, "bitmap %s of %u pages took %lu jiffies\n", - rw == WRITE ? "WRITE" : "READ", - count, jiffies - now); + dev_info(DEV, "bitmap %s of %u pages took %lu jiffies\n", + rw == WRITE ? 
"WRITE" : "READ", + count, jiffies - now); if (ctx->error) { dev_alert(DEV, "we had at least one MD IO ERROR during bitmap IO\n"); @@ -1138,7 +1103,7 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned flags, unsigned lazy_w } if (atomic_read(&ctx->in_flight)) - err = -EIO; /* Disk timeout/force-detach during IO... */ + err = -EIO; /* Disk failed during IO... */ now = jiffies; if (rw == WRITE) { @@ -1150,9 +1115,8 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned flags, unsigned lazy_w } now = b->bm_set; - if (flags == 0) - dev_info(DEV, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n", - ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now); + dev_info(DEV, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n", + ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now); kref_put(&ctx->kref, &bm_aio_ctx_destroy); return err; @@ -1215,17 +1179,9 @@ int drbd_bm_write_copy_pages(struct drbd_conf *mdev) __must_hold(local) return bm_rw(mdev, WRITE, BM_AIO_COPY_PAGES, 0); } -/** - * drbd_bm_write_hinted() - Write bitmap pages with "hint" marks, if they have changed. - * @mdev: DRBD device. - */ -int drbd_bm_write_hinted(struct drbd_conf *mdev) __must_hold(local) -{ - return bm_rw(mdev, WRITE, BM_AIO_WRITE_HINTED | BM_AIO_COPY_PAGES, 0); -} /** - * drbd_bm_write_page() - Writes a PAGE_SIZE aligned piece of bitmap + * drbd_bm_write_page: Writes a PAGE_SIZE aligned piece of bitmap * @mdev: DRBD device. * @idx: bitmap page index * @@ -1266,11 +1222,11 @@ int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(loc } bm_page_io_async(ctx, idx, WRITE_SYNC); - wait_until_done_or_force_detached(mdev, mdev->ldev, &ctx->done); + wait_until_done_or_disk_failure(mdev, mdev->ldev, &ctx->done); if (ctx->error) drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR); - /* that causes us to detach, so the in memory bitmap will be + /* that should force detach, so the in memory bitmap will be * gone in a moment as well. */ mdev->bm_writ_cnt++; @@ -1333,10 +1289,8 @@ static unsigned long bm_find_next(struct drbd_conf *mdev, struct drbd_bitmap *b = mdev->bitmap; unsigned long i = DRBD_END_OF_BITMAP; - if (!expect(b)) - return i; - if (!expect(b->bm_pages)) - return i; + ERR_IF(!b) return i; + ERR_IF(!b->bm_pages) return i; spin_lock_irq(&b->bm_lock); if (BM_DONT_TEST & b->bm_flags) @@ -1437,10 +1391,8 @@ static int bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s, struct drbd_bitmap *b = mdev->bitmap; int c = 0; - if (!expect(b)) - return 1; - if (!expect(b->bm_pages)) - return 0; + ERR_IF(!b) return 1; + ERR_IF(!b->bm_pages) return 0; spin_lock_irqsave(&b->bm_lock, flags); if ((val ? BM_DONT_SET : BM_DONT_CLEAR) & b->bm_flags) @@ -1471,21 +1423,13 @@ static inline void bm_set_full_words_within_one_page(struct drbd_bitmap *b, { int i; int bits; - int changed = 0; unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr]); for (i = first_word; i < last_word; i++) { bits = hweight_long(paddr[i]); paddr[i] = ~0UL; - changed += BITS_PER_LONG - bits; + b->bm_set += BITS_PER_LONG - bits; } kunmap_atomic(paddr); - if (changed) { - /* We only need lazy writeout, the information is still in the - * remote bitmap as well, and is reconstructed during the next - * bitmap exchange, if lost locally due to a crash. 
*/ - bm_set_page_lazy_writeout(b->bm_pages[page_nr]); - b->bm_set += changed; - } } /* Same thing as drbd_bm_set_bits, @@ -1580,10 +1524,8 @@ int drbd_bm_test_bit(struct drbd_conf *mdev, const unsigned long bitnr) unsigned long *p_addr; int i; - if (!expect(b)) - return 0; - if (!expect(b->bm_pages)) - return 0; + ERR_IF(!b) return 0; + ERR_IF(!b->bm_pages) return 0; spin_lock_irqsave(&b->bm_lock, flags); if (BM_DONT_TEST & b->bm_flags) @@ -1617,10 +1559,8 @@ int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsi * robust in case we screwed up elsewhere, in that case pretend there * was one dirty bit in the requested area, so we won't try to do a * local read there (no bitmap probably implies no disk) */ - if (!expect(b)) - return 1; - if (!expect(b->bm_pages)) - return 1; + ERR_IF(!b) return 1; + ERR_IF(!b->bm_pages) return 1; spin_lock_irqsave(&b->bm_lock, flags); if (BM_DONT_TEST & b->bm_flags) @@ -1633,10 +1573,11 @@ int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsi bm_unmap(p_addr); p_addr = bm_map_pidx(b, idx); } - if (expect(bitnr < b->bm_bits)) - c += (0 != test_bit_le(bitnr - (page_nr << (PAGE_SHIFT+3)), p_addr)); - else + ERR_IF (bitnr >= b->bm_bits) { dev_err(DEV, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits); + } else { + c += (0 != test_bit_le(bitnr - (page_nr << (PAGE_SHIFT+3)), p_addr)); + } } if (p_addr) bm_unmap(p_addr); @@ -1666,10 +1607,8 @@ int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr) unsigned long flags; unsigned long *p_addr, *bm; - if (!expect(b)) - return 0; - if (!expect(b->bm_pages)) - return 0; + ERR_IF(!b) return 0; + ERR_IF(!b->bm_pages) return 0; spin_lock_irqsave(&b->bm_lock, flags); if (BM_DONT_TEST & b->bm_flags) @@ -1691,3 +1630,47 @@ int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr) spin_unlock_irqrestore(&b->bm_lock, flags); return count; } + +/* Set all bits covered by the AL-extent al_enr. + * Returns number of bits changed. 
*/ +unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, unsigned long al_enr) +{ + struct drbd_bitmap *b = mdev->bitmap; + unsigned long *p_addr, *bm; + unsigned long weight; + unsigned long s, e; + int count, i, do_now; + ERR_IF(!b) return 0; + ERR_IF(!b->bm_pages) return 0; + + spin_lock_irq(&b->bm_lock); + if (BM_DONT_SET & b->bm_flags) + bm_print_lock_info(mdev); + weight = b->bm_set; + + s = al_enr * BM_WORDS_PER_AL_EXT; + e = min_t(size_t, s + BM_WORDS_PER_AL_EXT, b->bm_words); + /* assert that s and e are on the same page */ + D_ASSERT((e-1) >> (PAGE_SHIFT - LN2_BPL + 3) + == s >> (PAGE_SHIFT - LN2_BPL + 3)); + count = 0; + if (s < b->bm_words) { + i = do_now = e-s; + p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, s)); + bm = p_addr + MLPP(s); + while (i--) { + count += hweight_long(*bm); + *bm = -1UL; + bm++; + } + bm_unmap(p_addr); + b->bm_set += do_now*BITS_PER_LONG - count; + if (e == b->bm_words) + b->bm_set -= bm_clear_surplus(b); + } else { + dev_err(DEV, "start offset (%lu) too large in drbd_bm_ALe_set_all\n", s); + } + weight = b->bm_set - weight; + spin_unlock_irq(&b->bm_lock); + return weight; +} diff --git a/trunk/drivers/block/drbd/drbd_int.h b/trunk/drivers/block/drbd/drbd_int.h index 6b51afa1aae1..b953cc7c9c00 100644 --- a/trunk/drivers/block/drbd/drbd_int.h +++ b/trunk/drivers/block/drbd/drbd_int.h @@ -39,13 +39,9 @@ #include #include #include -#include #include #include #include -#include -#include -#include "drbd_state.h" #ifdef __CHECKER__ # define __protected_by(x) __attribute__((require_context(x,1,999,"rdwr"))) @@ -65,6 +61,7 @@ extern unsigned int minor_count; extern bool disable_sendpage; extern bool allow_oos; +extern unsigned int cn_idx; #ifdef CONFIG_DRBD_FAULT_INJECTION extern int enable_faults; @@ -89,44 +86,34 @@ extern char usermode_helper[]; */ #define DRBD_SIGKILL SIGHUP +/* All EEs on the free list should have ID_VACANT (== 0) + * freshly allocated EEs get !ID_VACANT (== 1) + * so if it says "cannot dereference null pointer at address 0x00000001", + * it is most likely one of these :( */ + #define ID_IN_SYNC (4711ULL) #define ID_OUT_OF_SYNC (4712ULL) -#define ID_SYNCER (-1ULL) +#define ID_SYNCER (-1ULL) +#define ID_VACANT 0 +#define is_syncer_block_id(id) ((id) == ID_SYNCER) #define UUID_NEW_BM_OFFSET ((u64)0x0001000000000000ULL) struct drbd_conf; -struct drbd_tconn; /* to shorten dev_warn(DEV, "msg"); and relatives statements */ #define DEV (disk_to_dev(mdev->vdisk)) -#define conn_printk(LEVEL, TCONN, FMT, ARGS...) \ - printk(LEVEL "d-con %s: " FMT, TCONN->name , ## ARGS) -#define conn_alert(TCONN, FMT, ARGS...) conn_printk(KERN_ALERT, TCONN, FMT, ## ARGS) -#define conn_crit(TCONN, FMT, ARGS...) conn_printk(KERN_CRIT, TCONN, FMT, ## ARGS) -#define conn_err(TCONN, FMT, ARGS...) conn_printk(KERN_ERR, TCONN, FMT, ## ARGS) -#define conn_warn(TCONN, FMT, ARGS...) conn_printk(KERN_WARNING, TCONN, FMT, ## ARGS) -#define conn_notice(TCONN, FMT, ARGS...) conn_printk(KERN_NOTICE, TCONN, FMT, ## ARGS) -#define conn_info(TCONN, FMT, ARGS...) conn_printk(KERN_INFO, TCONN, FMT, ## ARGS) -#define conn_dbg(TCONN, FMT, ARGS...) conn_printk(KERN_DEBUG, TCONN, FMT, ## ARGS) - #define D_ASSERT(exp) if (!(exp)) \ dev_err(DEV, "ASSERT( " #exp " ) in %s:%d\n", __FILE__, __LINE__) -/** - * expect - Make an assertion - * - * Unlike the assert macro, this macro returns a boolean result. 
- */ -#define expect(exp) ({ \ - bool _bool = (exp); \ - if (!_bool) \ - dev_err(DEV, "ASSERTION %s FAILED in %s\n", \ - #exp, __func__); \ - _bool; \ - }) +#define ERR_IF(exp) if (({ \ + int _b = (exp) != 0; \ + if (_b) dev_err(DEV, "ASSERT FAILED: %s: (%s) in %s:%d\n", \ + __func__, #exp, __FILE__, __LINE__); \ + _b; \ + })) /* Defines to control fault insertion */ enum { @@ -163,12 +150,15 @@ drbd_insert_fault(struct drbd_conf *mdev, unsigned int type) { /* usual integer division */ #define div_floor(A, B) ((A)/(B)) +/* drbd_meta-data.c (still in drbd_main.c) */ +/* 4th incarnation of the disk layout. */ +#define DRBD_MD_MAGIC (DRBD_MAGIC+4) + +extern struct drbd_conf **minor_table; extern struct ratelimit_state drbd_ratelimit_state; -extern struct idr minors; /* RCU, updates: genl_lock() */ -extern struct list_head drbd_tconns; /* RCU, updates: genl_lock() */ /* on the wire */ -enum drbd_packet { +enum drbd_packets { /* receiver (data socket) */ P_DATA = 0x00, P_DATA_REPLY = 0x01, /* Response to P_DATA_REQUEST */ @@ -196,7 +186,7 @@ enum drbd_packet { P_RECV_ACK = 0x15, /* Used in protocol B */ P_WRITE_ACK = 0x16, /* Used in protocol C */ P_RS_WRITE_ACK = 0x17, /* Is a P_WRITE_ACK, additionally call set_in_sync(). */ - P_SUPERSEDED = 0x18, /* Used in proto C, two-primaries conflict detection */ + P_DISCARD_ACK = 0x18, /* Used in proto C, two-primaries conflict detection */ P_NEG_ACK = 0x19, /* Sent if local disk is unusable */ P_NEG_DREPLY = 0x1a, /* Local disk is broken... */ P_NEG_RS_DREPLY = 0x1b, /* Local disk is broken... */ @@ -217,23 +207,77 @@ enum drbd_packet { P_DELAY_PROBE = 0x27, /* is used on BOTH sockets */ P_OUT_OF_SYNC = 0x28, /* Mark as out of sync (Outrunning), data socket */ P_RS_CANCEL = 0x29, /* meta: Used to cancel RS_DATA_REQUEST packet by SyncSource */ - P_CONN_ST_CHG_REQ = 0x2a, /* data sock: Connection wide state request */ - P_CONN_ST_CHG_REPLY = 0x2b, /* meta sock: Connection side state req reply */ - P_RETRY_WRITE = 0x2c, /* Protocol C: retry conflicting write request */ - P_PROTOCOL_UPDATE = 0x2d, /* data sock: is used in established connections */ + P_MAX_CMD = 0x2A, P_MAY_IGNORE = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */ P_MAX_OPT_CMD = 0x101, /* special command ids for handshake */ - P_INITIAL_META = 0xfff1, /* First Packet on the MetaSock */ - P_INITIAL_DATA = 0xfff2, /* First Packet on the Socket */ + P_HAND_SHAKE_M = 0xfff1, /* First Packet on the MetaSock */ + P_HAND_SHAKE_S = 0xfff2, /* First Packet on the Socket */ - P_CONNECTION_FEATURES = 0xfffe /* FIXED for the next century! */ + P_HAND_SHAKE = 0xfffe /* FIXED for the next century! 
*/ }; -extern const char *cmdname(enum drbd_packet cmd); +static inline const char *cmdname(enum drbd_packets cmd) +{ + /* THINK may need to become several global tables + * when we want to support more than + * one PRO_VERSION */ + static const char *cmdnames[] = { + [P_DATA] = "Data", + [P_DATA_REPLY] = "DataReply", + [P_RS_DATA_REPLY] = "RSDataReply", + [P_BARRIER] = "Barrier", + [P_BITMAP] = "ReportBitMap", + [P_BECOME_SYNC_TARGET] = "BecomeSyncTarget", + [P_BECOME_SYNC_SOURCE] = "BecomeSyncSource", + [P_UNPLUG_REMOTE] = "UnplugRemote", + [P_DATA_REQUEST] = "DataRequest", + [P_RS_DATA_REQUEST] = "RSDataRequest", + [P_SYNC_PARAM] = "SyncParam", + [P_SYNC_PARAM89] = "SyncParam89", + [P_PROTOCOL] = "ReportProtocol", + [P_UUIDS] = "ReportUUIDs", + [P_SIZES] = "ReportSizes", + [P_STATE] = "ReportState", + [P_SYNC_UUID] = "ReportSyncUUID", + [P_AUTH_CHALLENGE] = "AuthChallenge", + [P_AUTH_RESPONSE] = "AuthResponse", + [P_PING] = "Ping", + [P_PING_ACK] = "PingAck", + [P_RECV_ACK] = "RecvAck", + [P_WRITE_ACK] = "WriteAck", + [P_RS_WRITE_ACK] = "RSWriteAck", + [P_DISCARD_ACK] = "DiscardAck", + [P_NEG_ACK] = "NegAck", + [P_NEG_DREPLY] = "NegDReply", + [P_NEG_RS_DREPLY] = "NegRSDReply", + [P_BARRIER_ACK] = "BarrierAck", + [P_STATE_CHG_REQ] = "StateChgRequest", + [P_STATE_CHG_REPLY] = "StateChgReply", + [P_OV_REQUEST] = "OVRequest", + [P_OV_REPLY] = "OVReply", + [P_OV_RESULT] = "OVResult", + [P_CSUM_RS_REQUEST] = "CsumRSRequest", + [P_RS_IS_IN_SYNC] = "CsumRSIsInSync", + [P_COMPRESSED_BITMAP] = "CBitmap", + [P_DELAY_PROBE] = "DelayProbe", + [P_OUT_OF_SYNC] = "OutOfSync", + [P_MAX_CMD] = NULL, + }; + + if (cmd == P_HAND_SHAKE_M) + return "HandShakeM"; + if (cmd == P_HAND_SHAKE_S) + return "HandShakeS"; + if (cmd == P_HAND_SHAKE) + return "HandShake"; + if (cmd >= P_MAX_CMD) + return "Unknown"; + return cmdnames[cmd]; +} /* for sending/receiving the bitmap, * possibly in some encoding scheme */ @@ -293,24 +337,37 @@ struct p_header80 { u32 magic; u16 command; u16 length; /* bytes of data after this header */ + u8 payload[0]; } __packed; /* Header for big packets, Used for data packets exceeding 64kB */ struct p_header95 { u16 magic; /* use DRBD_MAGIC_BIG here */ u16 command; - u32 length; + u32 length; /* Use only 24 bits of that. Ignore the highest 8 bit. 
*/ + u8 payload[0]; } __packed; -struct p_header100 { - u32 magic; - u16 volume; - u16 command; - u32 length; - u32 pad; -} __packed; +union p_header { + struct p_header80 h80; + struct p_header95 h95; +}; -extern unsigned int drbd_header_size(struct drbd_tconn *tconn); +/* + * short commands, packets without payload, plain p_header: + * P_PING + * P_PING_ACK + * P_BECOME_SYNC_TARGET + * P_BECOME_SYNC_SOURCE + * P_UNPLUG_REMOTE + */ + +/* + * commands with out-of-struct payload: + * P_BITMAP (no additional fields) + * P_DATA, P_DATA_REPLY (see p_data) + * P_COMPRESSED_BITMAP (see receive_compressed_bitmap) + */ /* these defines must not be changed without changing the protocol version */ #define DP_HARDBARRIER 1 /* depricated */ @@ -320,10 +377,9 @@ extern unsigned int drbd_header_size(struct drbd_tconn *tconn); #define DP_FUA 16 /* equals REQ_FUA */ #define DP_FLUSH 32 /* equals REQ_FLUSH */ #define DP_DISCARD 64 /* equals REQ_DISCARD */ -#define DP_SEND_RECEIVE_ACK 128 /* This is a proto B write request */ -#define DP_SEND_WRITE_ACK 256 /* This is a proto C write request */ struct p_data { + union p_header head; u64 sector; /* 64 bits sector number */ u64 block_id; /* to identify the request in protocol B&C */ u32 seq_num; @@ -334,18 +390,21 @@ struct p_data { * commands which share a struct: * p_block_ack: * P_RECV_ACK (proto B), P_WRITE_ACK (proto C), - * P_SUPERSEDED (proto C, two-primaries conflict detection) + * P_DISCARD_ACK (proto C, two-primaries conflict detection) * p_block_req: * P_DATA_REQUEST, P_RS_DATA_REQUEST */ struct p_block_ack { + struct p_header80 head; u64 sector; u64 block_id; u32 blksize; u32 seq_num; } __packed; + struct p_block_req { + struct p_header80 head; u64 sector; u64 block_id; u32 blksize; @@ -354,52 +413,59 @@ struct p_block_req { /* * commands with their own struct for additional fields: - * P_CONNECTION_FEATURES + * P_HAND_SHAKE * P_BARRIER * P_BARRIER_ACK * P_SYNC_PARAM * ReportParams */ -struct p_connection_features { +struct p_handshake { + struct p_header80 head; /* 8 bytes */ u32 protocol_min; u32 feature_flags; u32 protocol_max; /* should be more than enough for future enhancements - * for now, feature_flags and the reserved array shall be zero. + * for now, feature_flags and the reserverd array shall be zero. */ u32 _pad; - u64 reserved[7]; + u64 reserverd[7]; } __packed; +/* 80 bytes, FIXED for the next century */ struct p_barrier { + struct p_header80 head; u32 barrier; /* barrier number _handle_ only */ u32 pad; /* to multiple of 8 Byte */ } __packed; struct p_barrier_ack { + struct p_header80 head; u32 barrier; u32 set_size; } __packed; struct p_rs_param { - u32 resync_rate; + struct p_header80 head; + u32 rate; /* Since protocol version 88 and higher. 
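For orientation, the fixed 8-byte on-wire header that these packet structs embed as their first member can be sketched in standalone C; the struct and field names below (hdr80, barrier_pkt) are illustrative stand-ins, not the kernel definitions:

        #include <stdint.h>
        #include <stdio.h>

        struct hdr80 {                  /* stand-in for p_header80: magic, command, length */
                uint32_t magic;
                uint16_t command;
                uint16_t length;        /* bytes of data after this header */
        } __attribute__((packed));

        struct barrier_pkt {            /* stand-in for p_barrier: header first, body follows */
                struct hdr80 head;
                uint32_t barrier;
                uint32_t pad;
        } __attribute__((packed));

        int main(void)
        {
                printf("header: %zu bytes, barrier packet: %zu bytes\n",
                       sizeof(struct hdr80), sizeof(struct barrier_pkt));       /* 8 and 16 */
                return 0;
        }

Because the header sits first and the layout is packed, the payload of every packet follows the 8-byte header directly on the wire.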
*/ char verify_alg[0]; } __packed; struct p_rs_param_89 { - u32 resync_rate; + struct p_header80 head; + u32 rate; /* protocol version 89: */ char verify_alg[SHARED_SECRET_MAX]; char csums_alg[SHARED_SECRET_MAX]; } __packed; struct p_rs_param_95 { - u32 resync_rate; + struct p_header80 head; + u32 rate; char verify_alg[SHARED_SECRET_MAX]; char csums_alg[SHARED_SECRET_MAX]; u32 c_plan_ahead; @@ -409,11 +475,12 @@ struct p_rs_param_95 { } __packed; enum drbd_conn_flags { - CF_DISCARD_MY_DATA = 1, + CF_WANT_LOSE = 1, CF_DRY_RUN = 2, }; struct p_protocol { + struct p_header80 head; u32 protocol; u32 after_sb_0p; u32 after_sb_1p; @@ -427,14 +494,17 @@ struct p_protocol { } __packed; struct p_uuids { + struct p_header80 head; u64 uuid[UI_EXTENDED_SIZE]; } __packed; struct p_rs_uuid { + struct p_header80 head; u64 uuid; } __packed; struct p_sizes { + struct p_header80 head; u64 d_size; /* size of disk */ u64 u_size; /* user requested size */ u64 c_size; /* current exported size */ @@ -444,15 +514,18 @@ struct p_sizes { } __packed; struct p_state { + struct p_header80 head; u32 state; } __packed; struct p_req_state { + struct p_header80 head; u32 mask; u32 val; } __packed; struct p_req_state_reply { + struct p_header80 head; u32 retcode; } __packed; @@ -466,7 +539,15 @@ struct p_drbd06_param { u32 bit_map_gen[5]; } __packed; +struct p_discard { + struct p_header80 head; + u64 block_id; + u32 seq_num; + u32 pad; +} __packed; + struct p_block_desc { + struct p_header80 head; u64 sector; u32 blksize; u32 pad; /* to multiple of 8 Byte */ @@ -482,6 +563,7 @@ enum drbd_bitmap_code { }; struct p_compressed_bm { + struct p_header80 head; /* (encoding & 0x0f): actual encoding, see enum drbd_bitmap_code * (encoding & 0x80): polarity (set/unset) of first runlength * ((encoding >> 4) & 0x07): pad_bits, number of trailing zero bits @@ -493,22 +575,90 @@ struct p_compressed_bm { } __packed; struct p_delay_probe93 { + struct p_header80 head; u32 seq_num; /* sequence number to match the two probe packets */ u32 offset; /* usecs the probe got sent after the reference time point */ } __packed; -/* - * Bitmap packets need to fit within a single page on the sender and receiver, - * so we are limited to 4 KiB (and not to PAGE_SIZE, which can be bigger). +/* DCBP: Drbd Compressed Bitmap Packet ... */ +static inline enum drbd_bitmap_code +DCBP_get_code(struct p_compressed_bm *p) +{ + return (enum drbd_bitmap_code)(p->encoding & 0x0f); +} + +static inline void +DCBP_set_code(struct p_compressed_bm *p, enum drbd_bitmap_code code) +{ + BUG_ON(code & ~0xf); + p->encoding = (p->encoding & ~0xf) | code; +} + +static inline int +DCBP_get_start(struct p_compressed_bm *p) +{ + return (p->encoding & 0x80) != 0; +} + +static inline void +DCBP_set_start(struct p_compressed_bm *p, int set) +{ + p->encoding = (p->encoding & ~0x80) | (set ? 0x80 : 0); +} + +static inline int +DCBP_get_pad_bits(struct p_compressed_bm *p) +{ + return (p->encoding >> 4) & 0x7; +} + +static inline void +DCBP_set_pad_bits(struct p_compressed_bm *p, int n) +{ + BUG_ON(n & ~0x7); + p->encoding = (p->encoding & (~0x7 << 4)) | (n << 4); +} + +/* one bitmap packet, including the p_header, + * should fit within one _architecture independend_ page. + * so we need to use the fixed size 4KiB page size + * most architectures have used for a long time. 
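The bit layout the DCBP helpers operate on (low nibble = bitmap code, bits 4-6 = trailing pad bits, bit 7 = polarity of the first run) can be illustrated with a small standalone sketch; this mirrors the documented layout only and is not the kernel code:

        #include <stdint.h>
        #include <stdio.h>

        /* pack the three sub-fields into one encoding byte */
        static uint8_t dcbp_pack(unsigned code, unsigned pad_bits, int first_run_set)
        {
                uint8_t enc = 0;

                enc |= code & 0x0f;                     /* bits 0-3: bitmap code */
                enc |= (pad_bits & 0x7) << 4;           /* bits 4-6: trailing pad bits */
                enc |= first_run_set ? 0x80 : 0;        /* bit 7: polarity of first run */
                return enc;
        }

        int main(void)
        {
                uint8_t enc = dcbp_pack(2, 5, 1);       /* arbitrary example values */

                printf("code=%d pad=%d start=%d\n",
                       enc & 0x0f, (enc >> 4) & 0x7, (enc & 0x80) != 0);
                return 0;
        }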
*/ -#define DRBD_SOCKET_BUFFER_SIZE 4096 +#define BM_PACKET_PAYLOAD_BYTES (4096 - sizeof(struct p_header80)) +#define BM_PACKET_WORDS (BM_PACKET_PAYLOAD_BYTES/sizeof(long)) +#define BM_PACKET_VLI_BYTES_MAX (4096 - sizeof(struct p_compressed_bm)) +#if (PAGE_SIZE < 4096) +/* drbd_send_bitmap / receive_bitmap would break horribly */ +#error "PAGE_SIZE too small" +#endif + +union p_polymorph { + union p_header header; + struct p_handshake handshake; + struct p_data data; + struct p_block_ack block_ack; + struct p_barrier barrier; + struct p_barrier_ack barrier_ack; + struct p_rs_param_89 rs_param_89; + struct p_rs_param_95 rs_param_95; + struct p_protocol protocol; + struct p_sizes sizes; + struct p_uuids uuids; + struct p_state state; + struct p_req_state req_state; + struct p_req_state_reply req_state_reply; + struct p_block_req block_req; + struct p_delay_probe93 delay_probe93; + struct p_rs_uuid rs_uuid; + struct p_block_desc block_desc; +} __packed; /**********************************************************************/ enum drbd_thread_state { - NONE, - RUNNING, - EXITING, - RESTARTING + None, + Running, + Exiting, + Restarting }; struct drbd_thread { @@ -517,9 +667,8 @@ struct drbd_thread { struct completion stop; enum drbd_thread_state t_state; int (*function) (struct drbd_thread *); - struct drbd_tconn *tconn; + struct drbd_conf *mdev; int reset_cpu_mask; - char name[9]; }; static inline enum drbd_thread_state get_t_state(struct drbd_thread *thi) @@ -532,54 +681,58 @@ static inline enum drbd_thread_state get_t_state(struct drbd_thread *thi) return thi->t_state; } +struct drbd_work; +typedef int (*drbd_work_cb)(struct drbd_conf *, struct drbd_work *, int cancel); struct drbd_work { struct list_head list; - int (*cb)(struct drbd_work *, int cancel); - union { - struct drbd_conf *mdev; - struct drbd_tconn *tconn; - }; + drbd_work_cb cb; }; -#include "drbd_interval.h" - -extern int drbd_wait_misc(struct drbd_conf *, struct drbd_interval *); - +struct drbd_tl_epoch; struct drbd_request { struct drbd_work w; + struct drbd_conf *mdev; /* if local IO is not allowed, will be NULL. * if local IO _is_ allowed, holds the locally submitted bio clone, * or, after local IO completion, the ERR_PTR(error). - * see drbd_request_endio(). */ + * see drbd_endio_pri(). */ struct bio *private_bio; - struct drbd_interval i; + struct hlist_node collision; + sector_t sector; + unsigned int size; + unsigned int epoch; /* barrier_nr */ - /* epoch: used to check on "completion" whether this req was in + /* barrier_nr: used to check on "completion" whether this req was in * the current epoch, and we therefore have to close it, - * causing a p_barrier packet to be send, starting a new epoch. - * - * This corresponds to "barrier" in struct p_barrier[_ack], - * and to "barrier_nr" in struct drbd_epoch (and various - * comments/function parameters/local variable names). + * starting a new epoch... 
*/ - unsigned int epoch; struct list_head tl_requests; /* ring list in the transfer log */ struct bio *master_bio; /* master bio pointer */ + unsigned long rq_state; /* see comments above _req_mod() */ unsigned long start_time; +}; - /* once it hits 0, we may complete the master_bio */ - atomic_t completion_ref; - /* once it hits 0, we may destroy this drbd_request object */ - struct kref kref; - - unsigned rq_state; /* see comments above _req_mod() */ +struct drbd_tl_epoch { + struct drbd_work w; + struct list_head requests; /* requests before */ + struct drbd_tl_epoch *next; /* pointer to the next barrier */ + unsigned int br_number; /* the barriers identifier. */ + int n_writes; /* number of requests attached before this barrier */ }; +struct drbd_request; + +/* These Tl_epoch_entries may be in one of 6 lists: + active_ee .. data packet being written + sync_ee .. syncer block being written + done_ee .. block written, need to send P_WRITE_ACK + read_ee .. [RS]P_DATA_REQUEST being read +*/ + struct drbd_epoch { - struct drbd_tconn *tconn; struct list_head list; unsigned int barrier_nr; atomic_t epoch_size; /* increased on every request added. */ @@ -609,14 +762,17 @@ struct digest_info { void *digest; }; -struct drbd_peer_request { +struct drbd_epoch_entry { struct drbd_work w; + struct hlist_node collision; struct drbd_epoch *epoch; /* for writes */ + struct drbd_conf *mdev; struct page *pages; atomic_t pending_bios; - struct drbd_interval i; + unsigned int size; /* see comments on ee flag bits below */ unsigned long flags; + sector_t sector; union { u64 block_id; struct digest_info *digest; @@ -637,37 +793,31 @@ enum { * we need to resubmit without the barrier flag. */ __EE_RESUBMITTED, - /* we may have several bios per peer request. + /* we may have several bios per epoch entry. * if any of those fail, we set this flag atomically * from the endio callback */ __EE_WAS_ERROR, /* This ee has a pointer to a digest instead of a block id */ __EE_HAS_DIGEST, - - /* Conflicting local requests need to be restarted after this request */ - __EE_RESTART_REQUESTS, - - /* The peer wants a write ACK for this (wire proto C) */ - __EE_SEND_WRITE_ACK, - - /* Is set when net_conf had two_primaries set while creating this peer_req */ - __EE_IN_INTERVAL_TREE, }; #define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO) #define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC) #define EE_RESUBMITTED (1<<__EE_RESUBMITTED) #define EE_WAS_ERROR (1<<__EE_WAS_ERROR) #define EE_HAS_DIGEST (1<<__EE_HAS_DIGEST) -#define EE_RESTART_REQUESTS (1<<__EE_RESTART_REQUESTS) -#define EE_SEND_WRITE_ACK (1<<__EE_SEND_WRITE_ACK) -#define EE_IN_INTERVAL_TREE (1<<__EE_IN_INTERVAL_TREE) -/* flag bits per mdev */ +/* global flag bits */ enum { + CREATE_BARRIER, /* next P_DATA is preceded by a P_BARRIER */ + SIGNAL_ASENDER, /* whether asender wants to be interrupted */ + SEND_PING, /* whether asender should send a ping asap */ + UNPLUG_REMOTE, /* sending a "UnplugRemote" could help */ MD_DIRTY, /* current uuids and flags not yet on disk */ + DISCARD_CONCURRENT, /* Set on one node, cleared on the peer! */ USE_DEGR_WFC_T, /* degr-wfc-timeout instead of wfc-timeout. */ + CLUSTER_ST_CHANGE, /* Cluster wide state change going on... */ CL_ST_CHG_SUCCESS, CL_ST_CHG_FAIL, CRASHED_PRIMARY, /* This node was a crashed primary. @@ -681,18 +831,32 @@ enum { once no more io in flight, start bitmap io */ BITMAP_IO_QUEUED, /* Started bitmap IO */ GO_DISKLESS, /* Disk is being detached, on io-error or admin request. 
*/ - WAS_IO_ERROR, /* Local disk failed, returned IO error */ - WAS_READ_ERROR, /* Local disk READ failed (set additionally to the above) */ + WAS_IO_ERROR, /* Local disk failed returned IO error */ FORCE_DETACH, /* Force-detach from local disk, aborting any pending local IO */ RESYNC_AFTER_NEG, /* Resync after online grow after the attach&negotiate finished. */ + NET_CONGESTED, /* The data socket is congested */ + + CONFIG_PENDING, /* serialization of (re)configuration requests. + * if set, also prevents the device from dying */ + DEVICE_DYING, /* device became unconfigured, + * but worker thread is still handling the cleanup. + * reconfiguring (nl_disk_conf, nl_net_conf) is dissalowed, + * while this is set. */ RESIZE_PENDING, /* Size change detected locally, waiting for the response from * the peer, if it changed there as well. */ + CONN_DRY_RUN, /* Expect disconnect after resync handshake. */ + GOT_PING_ACK, /* set when we receive a ping_ack packet, misc wait gets woken */ NEW_CUR_UUID, /* Create new current UUID when thawing IO */ AL_SUSPENDED, /* Activity logging is currently suspended. */ AHEAD_TO_SYNC_SOURCE, /* Ahead -> SyncSource queued */ - B_RS_H_DONE, /* Before resync handler done (already executed) */ - DISCARD_MY_DATA, /* discard_my_data flag per volume */ - READ_BALANCE_RR, + STATE_SENT, /* Do not change state/UUIDs while this is set */ + + CALLBACK_PENDING, /* Whether we have a call_usermodehelper(, UMH_WAIT_PROC) + * pending, from drbd worker context. + * If set, bdi_write_congested() returns true, + * so shrink_page_list() would not recurse into, + * and potentially deadlock on, this drbd worker. + */ }; struct drbd_bitmap; /* opaque for drbd_conf */ @@ -730,24 +894,24 @@ enum bm_flag { struct drbd_work_queue { struct list_head q; + struct semaphore s; /* producers up it, worker down()s it */ spinlock_t q_lock; /* to protect the list. */ - wait_queue_head_t q_wait; }; struct drbd_socket { + struct drbd_work_queue work; struct mutex mutex; struct socket *socket; /* this way we get our * send/receive buffers off the stack */ - void *sbuf; - void *rbuf; + union p_polymorph sbuf; + union p_polymorph rbuf; }; struct drbd_md { u64 md_offset; /* sector offset to 'super' block */ u64 la_size_sect; /* last agreed size, unit sectors */ - spinlock_t uuid_lock; u64 uuid[UI_SIZE]; u64 device_uuid; u32 flags; @@ -757,16 +921,24 @@ struct drbd_md { s32 bm_offset; /* signed relative sector offset to bitmap */ /* u32 al_nr_extents; important for restoring the AL - * is stored into ldev->dc.al_extents, which in turn + * is stored into sync_conf.al_extents, which in turn * gets applied to act_log->nr_elements */ }; +/* for sync_conf and other types... */ +#define NL_PACKET(name, number, fields) struct name { fields }; +#define NL_INTEGER(pn,pr,member) int member; +#define NL_INT64(pn,pr,member) __u64 member; +#define NL_BIT(pn,pr,member) unsigned member:1; +#define NL_STRING(pn,pr,member,len) unsigned char member[len]; int member ## _len; +#include + struct drbd_backing_dev { struct block_device *backing_bdev; struct block_device *md_bdev; struct drbd_md md; - struct disk_conf *disk_conf; /* RCU, for updates: mdev->tconn->conf_update */ + struct disk_conf dc; /* The user provided config... 
*/ sector_t known_size; /* last known size of that backing device */ }; @@ -790,116 +962,18 @@ enum write_ordering_e { }; struct fifo_buffer { + int *values; unsigned int head_index; unsigned int size; - int total; /* sum of all values */ - int values[0]; -}; -extern struct fifo_buffer *fifo_alloc(int fifo_size); - -/* flag bits per tconn */ -enum { - NET_CONGESTED, /* The data socket is congested */ - RESOLVE_CONFLICTS, /* Set on one node, cleared on the peer! */ - SEND_PING, /* whether asender should send a ping asap */ - SIGNAL_ASENDER, /* whether asender wants to be interrupted */ - GOT_PING_ACK, /* set when we receive a ping_ack packet, ping_wait gets woken */ - CONN_WD_ST_CHG_REQ, /* A cluster wide state change on the connection is active */ - CONN_WD_ST_CHG_OKAY, - CONN_WD_ST_CHG_FAIL, - CONN_DRY_RUN, /* Expect disconnect after resync handshake. */ - CREATE_BARRIER, /* next P_DATA is preceded by a P_BARRIER */ - STATE_SENT, /* Do not change state/UUIDs while this is set */ - CALLBACK_PENDING, /* Whether we have a call_usermodehelper(, UMH_WAIT_PROC) - * pending, from drbd worker context. - * If set, bdi_write_congested() returns true, - * so shrink_page_list() would not recurse into, - * and potentially deadlock on, this drbd worker. - */ - DISCONNECT_SENT, -}; - -struct drbd_tconn { /* is a resource from the config file */ - char *name; /* Resource name */ - struct list_head all_tconn; /* linked on global drbd_tconns */ - struct kref kref; - struct idr volumes; /* to mdev mapping */ - enum drbd_conns cstate; /* Only C_STANDALONE to C_WF_REPORT_PARAMS */ - unsigned susp:1; /* IO suspended by user */ - unsigned susp_nod:1; /* IO suspended because no data */ - unsigned susp_fen:1; /* IO suspended because fence peer handler runs */ - struct mutex cstate_mutex; /* Protects graceful disconnects */ - - unsigned long flags; - struct net_conf *net_conf; /* content protected by rcu */ - struct mutex conf_update; /* mutex for ready-copy-update of net_conf and disk_conf */ - wait_queue_head_t ping_wait; /* Woken upon reception of a ping, and a state change */ - struct res_opts res_opts; - - struct sockaddr_storage my_addr; - int my_addr_len; - struct sockaddr_storage peer_addr; - int peer_addr_len; - - struct drbd_socket data; /* data/barrier/cstate/parameter packets */ - struct drbd_socket meta; /* ping/ack (metadata) packets */ - int agreed_pro_version; /* actually used protocol version */ - unsigned long last_received; /* in jiffies, either socket */ - unsigned int ko_count; - - spinlock_t req_lock; - - struct list_head transfer_log; /* all requests not yet fully processed */ - - struct crypto_hash *cram_hmac_tfm; - struct crypto_hash *integrity_tfm; /* checksums we compute, updates protected by tconn->data->mutex */ - struct crypto_hash *peer_integrity_tfm; /* checksums we verify, only accessed from receiver thread */ - struct crypto_hash *csums_tfm; - struct crypto_hash *verify_tfm; - void *int_dig_in; - void *int_dig_vv; - - /* receiver side */ - struct drbd_epoch *current_epoch; - spinlock_t epoch_lock; - unsigned int epochs; - enum write_ordering_e write_ordering; - atomic_t current_tle_nr; /* transfer log epoch number */ - unsigned current_tle_writes; /* writes seen within this tl epoch */ - - unsigned long last_reconnect_jif; - struct drbd_thread receiver; - struct drbd_thread worker; - struct drbd_thread asender; - cpumask_var_t cpu_mask; - - /* sender side */ - struct drbd_work_queue sender_work; - - struct { - /* whether this sender thread - * has processed a single write yet. 
*/ - bool seen_any_write_yet; - - /* Which barrier number to send with the next P_BARRIER */ - int current_epoch_nr; - - /* how many write requests have been sent - * with req->epoch == current_epoch_nr. - * If none, no P_BARRIER will be sent. */ - unsigned current_epoch_writes; - } send; }; struct drbd_conf { - struct drbd_tconn *tconn; - int vnr; /* volume number within the connection */ - struct kref kref; - /* things that are stored as / read from meta data on disk */ unsigned long flags; /* configured by drbdsetup */ + struct net_conf *net_conf; /* protected by get_net_conf() and put_net_conf() */ + struct syncer_conf sync_conf; struct drbd_backing_dev *ldev __protected_by(local); sector_t p_size; /* partner's disk size */ @@ -907,7 +981,11 @@ struct drbd_conf { struct block_device *this_bdev; struct gendisk *vdisk; - unsigned long last_reattach_jif; + struct drbd_socket data; /* data/barrier/cstate/parameter packets */ + struct drbd_socket meta; /* ping/ack (metadata) packets */ + int agreed_pro_version; /* actually used protocol version */ + unsigned long last_received; /* in jiffies, either socket */ + unsigned int ko_count; struct drbd_work resync_work, unplug_work, go_diskless, @@ -927,9 +1005,10 @@ struct drbd_conf { /* Used after attach while negotiating new disk state. */ union drbd_state new_state_tmp; - union drbd_dev_state state; + union drbd_state state; wait_queue_head_t misc_wait; wait_queue_head_t state_wait; /* upon each state change. */ + wait_queue_head_t net_cnt_wait; unsigned int send_cnt; unsigned int recv_cnt; unsigned int read_cnt; @@ -939,12 +1018,17 @@ struct drbd_conf { atomic_t ap_bio_cnt; /* Requests we need to complete */ atomic_t ap_pending_cnt; /* AP data packets on the wire, ack expected */ atomic_t rs_pending_cnt; /* RS request/data packets on the wire */ - atomic_t unacked_cnt; /* Need to send replies for */ + atomic_t unacked_cnt; /* Need to send replys for */ atomic_t local_cnt; /* Waiting for local completion */ - - /* Interval tree of pending local requests */ - struct rb_root read_requests; - struct rb_root write_requests; + atomic_t net_cnt; /* Users of net_conf */ + spinlock_t req_lock; + struct drbd_tl_epoch *unused_spare_tle; /* for pre-allocation */ + struct drbd_tl_epoch *newest_tle; + struct drbd_tl_epoch *oldest_tle; + struct list_head out_of_sequence_requests; + struct list_head barrier_acked_requests; + struct hlist_head *tl_hash; + unsigned int tl_hash_s; /* blocks to resync in this run [unit BM_BLOCK_SIZE] */ unsigned long rs_total; @@ -964,11 +1048,9 @@ struct drbd_conf { unsigned long rs_mark_time[DRBD_SYNC_MARKS]; /* current index into rs_mark_{left,time} */ int rs_last_mark; - unsigned long rs_last_bcast; /* [unit jiffies] */ /* where does the admin want us to start? (sector) */ sector_t ov_start_sector; - sector_t ov_stop_sector; /* where are we now? (sector) */ sector_t ov_position; /* Start sector of out of sync range (to merge printk reporting). */ @@ -976,7 +1058,14 @@ struct drbd_conf { /* size of out-of-sync range in sectors. 
*/ sector_t ov_last_oos_size; unsigned long ov_left; /* in bits */ + struct crypto_hash *csums_tfm; + struct crypto_hash *verify_tfm; + unsigned long last_reattach_jif; + unsigned long last_reconnect_jif; + struct drbd_thread receiver; + struct drbd_thread worker; + struct drbd_thread asender; struct drbd_bitmap *bitmap; unsigned long bm_resync_fo; /* bit offset for drbd_bm_find_next */ @@ -989,19 +1078,29 @@ struct drbd_conf { int open_cnt; u64 *p_uuid; - + struct drbd_epoch *current_epoch; + spinlock_t epoch_lock; + unsigned int epochs; + enum write_ordering_e write_ordering; struct list_head active_ee; /* IO in progress (P_DATA gets written to disk) */ struct list_head sync_ee; /* IO in progress (P_RS_DATA_REPLY gets written to disk) */ - struct list_head done_ee; /* need to send P_WRITE_ACK */ - struct list_head read_ee; /* [RS]P_DATA_REQUEST being read */ + struct list_head done_ee; /* send ack */ + struct list_head read_ee; /* IO in progress (any read) */ struct list_head net_ee; /* zero-copy network send in progress */ + struct hlist_head *ee_hash; /* is proteced by req_lock! */ + unsigned int ee_hash_s; + + /* this one is protected by ee_lock, single thread */ + struct drbd_epoch_entry *last_write_w_barrier; int next_barrier_nr; + struct hlist_head *app_reads_hash; /* is proteced by req_lock */ struct list_head resync_reads; atomic_t pp_in_use; /* allocated from page pool */ atomic_t pp_in_use_by_net; /* sendpage()d, still referenced by tcp */ wait_queue_head_t ee_wait; struct page *md_io_page; /* one page buffer for md_io */ + struct page *md_io_tmpp; /* for logical_block_size != 512 */ struct drbd_md_io md_io; atomic_t md_io_in_use; /* protects the md_io, md_io_page and md_io_tmpp */ spinlock_t al_lock; @@ -1010,16 +1109,22 @@ struct drbd_conf { unsigned int al_tr_number; int al_tr_cycle; int al_tr_pos; /* position of the next transaction in the journal */ + struct crypto_hash *cram_hmac_tfm; + struct crypto_hash *integrity_w_tfm; /* to be used by the worker thread */ + struct crypto_hash *integrity_r_tfm; /* to be used by the receiver thread */ + void *int_dig_out; + void *int_dig_in; + void *int_dig_vv; wait_queue_head_t seq_wait; atomic_t packet_seq; unsigned int peer_seq; spinlock_t peer_seq_lock; unsigned int minor; unsigned long comm_bm_set; /* communicated number of set bits. */ + cpumask_var_t cpu_mask; struct bm_io_work bm_io_work; u64 ed_uuid; /* UUID of the exposed data */ - struct mutex own_state_mutex; - struct mutex *state_mutex; /* either own_state_mutex or mdev->tconn->cstate_mutex */ + struct mutex state_mutex; char congestion_reason; /* Why we where congested... */ atomic_t rs_sect_in; /* for incoming resync data rate, SyncTarget */ atomic_t rs_sect_ev; /* for submitted resync data rate, both */ @@ -1027,8 +1132,9 @@ struct drbd_conf { int rs_last_events; /* counter of read or write "events" (unit sectors) * on the lower level device when we last looked. 
*/ int c_sync_rate; /* current resync rate after syncer throttle magic */ - struct fifo_buffer *rs_plan_s; /* correction values of resync planer (RCU, tconn->conn_update) */ + struct fifo_buffer rs_plan_s; /* correction values of resync planer */ int rs_in_flight; /* resync sectors in flight (to proxy, in proxy and from proxy) */ + int rs_planed; /* resync sectors already planned */ atomic_t ap_in_flight; /* App sectors in flight (waiting for ack) */ unsigned int peer_max_bio_size; unsigned int local_max_bio_size; @@ -1036,7 +1142,11 @@ struct drbd_conf { static inline struct drbd_conf *minor_to_mdev(unsigned int minor) { - return (struct drbd_conf *)idr_find(&minors, minor); + struct drbd_conf *mdev; + + mdev = minor < minor_count ? minor_table[minor] : NULL; + + return mdev; } static inline unsigned int mdev_to_minor(struct drbd_conf *mdev) @@ -1044,9 +1154,29 @@ static inline unsigned int mdev_to_minor(struct drbd_conf *mdev) return mdev->minor; } -static inline struct drbd_conf *vnr_to_mdev(struct drbd_tconn *tconn, int vnr) +/* returns 1 if it was successful, + * returns 0 if there was no data socket. + * so wherever you are going to use the data.socket, e.g. do + * if (!drbd_get_data_sock(mdev)) + * return 0; + * CODE(); + * drbd_put_data_sock(mdev); + */ +static inline int drbd_get_data_sock(struct drbd_conf *mdev) +{ + mutex_lock(&mdev->data.mutex); + /* drbd_disconnect() could have called drbd_free_sock() + * while we were waiting in down()... */ + if (unlikely(mdev->data.socket == NULL)) { + mutex_unlock(&mdev->data.mutex); + return 0; + } + return 1; +} + +static inline void drbd_put_data_sock(struct drbd_conf *mdev) { - return (struct drbd_conf *)idr_find(&tconn->volumes, vnr); + mutex_unlock(&mdev->data.mutex); } /* @@ -1055,77 +1185,106 @@ static inline struct drbd_conf *vnr_to_mdev(struct drbd_tconn *tconn, int vnr) /* drbd_main.c */ +enum chg_state_flags { + CS_HARD = 1, + CS_VERBOSE = 2, + CS_WAIT_COMPLETE = 4, + CS_SERIALIZE = 8, + CS_ORDERED = CS_WAIT_COMPLETE + CS_SERIALIZE, +}; + enum dds_flags { DDSF_FORCED = 1, DDSF_NO_RESYNC = 2, /* Do not run a resync for the new space */ }; extern void drbd_init_set_defaults(struct drbd_conf *mdev); +extern enum drbd_state_rv drbd_change_state(struct drbd_conf *mdev, + enum chg_state_flags f, + union drbd_state mask, + union drbd_state val); +extern void drbd_force_state(struct drbd_conf *, union drbd_state, + union drbd_state); +extern enum drbd_state_rv _drbd_request_state(struct drbd_conf *, + union drbd_state, + union drbd_state, + enum chg_state_flags); +extern enum drbd_state_rv __drbd_set_state(struct drbd_conf *, union drbd_state, + enum chg_state_flags, + struct completion *done); +extern void print_st_err(struct drbd_conf *, union drbd_state, + union drbd_state, int); extern int drbd_thread_start(struct drbd_thread *thi); extern void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait); -extern char *drbd_task_to_thread_name(struct drbd_tconn *tconn, struct task_struct *task); #ifdef CONFIG_SMP -extern void drbd_thread_current_set_cpu(struct drbd_thread *thi); -extern void drbd_calc_cpu_mask(struct drbd_tconn *tconn); +extern void drbd_thread_current_set_cpu(struct drbd_conf *mdev); +extern void drbd_calc_cpu_mask(struct drbd_conf *mdev); #else #define drbd_thread_current_set_cpu(A) ({}) #define drbd_calc_cpu_mask(A) ({}) #endif -extern void tl_release(struct drbd_tconn *, unsigned int barrier_nr, +extern void drbd_free_resources(struct drbd_conf *mdev); +extern void tl_release(struct drbd_conf *mdev, 
unsigned int barrier_nr, unsigned int set_size); -extern void tl_clear(struct drbd_tconn *); -extern void drbd_free_sock(struct drbd_tconn *tconn); -extern int drbd_send(struct drbd_tconn *tconn, struct socket *sock, - void *buf, size_t size, unsigned msg_flags); -extern int drbd_send_all(struct drbd_tconn *, struct socket *, void *, size_t, - unsigned); - -extern int __drbd_send_protocol(struct drbd_tconn *tconn, enum drbd_packet cmd); -extern int drbd_send_protocol(struct drbd_tconn *tconn); +extern void tl_clear(struct drbd_conf *mdev); +extern void _tl_add_barrier(struct drbd_conf *, struct drbd_tl_epoch *); +extern void drbd_free_sock(struct drbd_conf *mdev); +extern int drbd_send(struct drbd_conf *mdev, struct socket *sock, + void *buf, size_t size, unsigned msg_flags); +extern int drbd_send_protocol(struct drbd_conf *mdev); extern int drbd_send_uuids(struct drbd_conf *mdev); extern int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev); -extern void drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev); +extern int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev); extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags); extern int drbd_send_state(struct drbd_conf *mdev, union drbd_state s); extern int drbd_send_current_state(struct drbd_conf *mdev); -extern int drbd_send_sync_param(struct drbd_conf *mdev); -extern void drbd_send_b_ack(struct drbd_tconn *tconn, u32 barrier_nr, - u32 set_size); -extern int drbd_send_ack(struct drbd_conf *, enum drbd_packet, - struct drbd_peer_request *); -extern void drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packet cmd, - struct p_block_req *rp); -extern void drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packet cmd, - struct p_data *dp, int data_size); -extern int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packet cmd, +extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock, + enum drbd_packets cmd, struct p_header80 *h, + size_t size, unsigned msg_flags); +#define USE_DATA_SOCKET 1 +#define USE_META_SOCKET 0 +extern int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket, + enum drbd_packets cmd, struct p_header80 *h, + size_t size); +extern int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, + char *data, size_t size); +extern int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc); +extern int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, + u32 set_size); +extern int drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd, + struct drbd_epoch_entry *e); +extern int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd, + struct p_block_req *rp); +extern int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd, + struct p_data *dp, int data_size); +extern int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd, sector_t sector, int blksize, u64 block_id); -extern int drbd_send_out_of_sync(struct drbd_conf *, struct drbd_request *); -extern int drbd_send_block(struct drbd_conf *, enum drbd_packet, - struct drbd_peer_request *); +extern int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req); +extern int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd, + struct drbd_epoch_entry *e); extern int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req); extern int drbd_send_drequest(struct drbd_conf *mdev, int cmd, sector_t sector, int size, u64 block_id); -extern int drbd_send_drequest_csum(struct drbd_conf *mdev, sector_t sector, - int size, void *digest, int 
digest_size, - enum drbd_packet cmd); +extern int drbd_send_drequest_csum(struct drbd_conf *mdev, + sector_t sector,int size, + void *digest, int digest_size, + enum drbd_packets cmd); extern int drbd_send_ov_request(struct drbd_conf *mdev,sector_t sector,int size); extern int drbd_send_bitmap(struct drbd_conf *mdev); -extern void drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode); -extern void conn_send_sr_reply(struct drbd_tconn *tconn, enum drbd_state_rv retcode); +extern int _drbd_send_bitmap(struct drbd_conf *mdev); +extern int drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode); extern void drbd_free_bc(struct drbd_backing_dev *ldev); extern void drbd_mdev_cleanup(struct drbd_conf *mdev); void drbd_print_uuids(struct drbd_conf *mdev, const char *text); -extern void conn_md_sync(struct drbd_tconn *tconn); extern void drbd_md_sync(struct drbd_conf *mdev); extern int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev); extern void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local); extern void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local); extern void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local); +extern void _drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local); extern void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local); -extern void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local); -extern void __drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local); extern void drbd_md_set_flag(struct drbd_conf *mdev, int flags) __must_hold(local); extern void drbd_md_clear_flag(struct drbd_conf *mdev, int flags)__must_hold(local); extern int drbd_md_test_flag(struct drbd_backing_dev *, int); @@ -1143,52 +1302,33 @@ extern void drbd_queue_bitmap_io(struct drbd_conf *mdev, extern int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why, enum bm_flag flags); -extern int drbd_bitmap_io_from_worker(struct drbd_conf *mdev, - int (*io_fn)(struct drbd_conf *), - char *why, enum bm_flag flags); extern int drbd_bmio_set_n_write(struct drbd_conf *mdev); extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev); extern void drbd_go_diskless(struct drbd_conf *mdev); extern void drbd_ldev_destroy(struct drbd_conf *mdev); + /* Meta data layout We reserve a 128MB Block (4k aligned) * either at the end of the backing device * or on a separate meta data device. */ -/* The following numbers are sectors */ -/* Allows up to about 3.8TB, so if you want more, - * you need to use the "flexible" meta data format. */ #define MD_RESERVED_SECT (128LU << 11) /* 128 MB, unit sectors */ -#define MD_AL_OFFSET 8 /* 8 Sectors after start of meta area */ -#define MD_AL_SECTORS 64 /* = 32 kB on disk activity log ring buffer */ -#define MD_BM_OFFSET (MD_AL_OFFSET + MD_AL_SECTORS) - -/* we do all meta data IO in 4k blocks */ -#define MD_BLOCK_SHIFT 12 -#define MD_BLOCK_SIZE (1< BIO_MAX_SIZE -#error Architecture not supported: DRBD_MAX_BIO_SIZE > BIO_MAX_SIZE -#endif +/* Sector shift value for the "hash" functions of tl_hash and ee_hash tables. + * With a value of 8 all IO in one 128K block make it to the same slot of the + * hash table. 
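The HT_SHIFT arithmetic works out to 128 KiB per hash slot, which is also the maximum bio size; a tiny standalone check (plain C, nothing DRBD-specific assumed beyond the two defines shown above):

        #include <stdio.h>

        #define HT_SHIFT 8
        #define DRBD_MAX_BIO_SIZE (1U << (9 + HT_SHIFT))        /* 512-byte sectors << HT_SHIFT */

        int main(void)
        {
                printf("%u bytes = %u KiB per hash slot / max bio\n",
                       DRBD_MAX_BIO_SIZE, DRBD_MAX_BIO_SIZE >> 10);    /* 131072 = 128 KiB */
                return 0;
        }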
*/ +#define HT_SHIFT 8 +#define DRBD_MAX_BIO_SIZE (1U<<(9+HT_SHIFT)) #define DRBD_MAX_BIO_SIZE_SAFE (1U << 12) /* Works always = 4k */ -#define DRBD_MAX_SIZE_H80_PACKET (1U << 15) /* Header 80 only allows packets up to 32KiB data */ -#define DRBD_MAX_BIO_SIZE_P95 (1U << 17) /* Protocol 95 to 99 allows bios up to 128KiB */ +#define DRBD_MAX_SIZE_H80_PACKET (1U << 15) /* The old header only allows packets up to 32Kib data */ + +/* Number of elements in the app_reads_hash */ +#define APP_R_HSIZE 15 extern int drbd_bm_init(struct drbd_conf *mdev); extern int drbd_bm_resize(struct drbd_conf *mdev, sector_t sectors, int set_new_bits); @@ -1334,11 +1468,11 @@ extern int drbd_bm_test_bit(struct drbd_conf *mdev, unsigned long bitnr); extern int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr); extern int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local); extern int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local); -extern void drbd_bm_mark_for_writeout(struct drbd_conf *mdev, int page_nr); extern int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local); -extern int drbd_bm_write_hinted(struct drbd_conf *mdev) __must_hold(local); extern int drbd_bm_write_all(struct drbd_conf *mdev) __must_hold(local); extern int drbd_bm_write_copy_pages(struct drbd_conf *mdev) __must_hold(local); +extern unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, + unsigned long al_enr); extern size_t drbd_bm_words(struct drbd_conf *mdev); extern unsigned long drbd_bm_bits(struct drbd_conf *mdev); extern sector_t drbd_bm_capacity(struct drbd_conf *mdev); @@ -1363,7 +1497,7 @@ extern void drbd_bm_unlock(struct drbd_conf *mdev); /* drbd_main.c */ extern struct kmem_cache *drbd_request_cache; -extern struct kmem_cache *drbd_ee_cache; /* peer requests */ +extern struct kmem_cache *drbd_ee_cache; /* epoch entries */ extern struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */ extern struct kmem_cache *drbd_al_ext_cache; /* activity log extents */ extern mempool_t *drbd_request_mempool; @@ -1403,22 +1537,12 @@ extern struct bio *bio_alloc_drbd(gfp_t gfp_mask); extern rwlock_t global_state_lock; -extern int conn_lowest_minor(struct drbd_tconn *tconn); -enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor, int vnr); -extern void drbd_minor_destroy(struct kref *kref); - -extern int set_resource_options(struct drbd_tconn *tconn, struct res_opts *res_opts); -extern struct drbd_tconn *conn_create(const char *name, struct res_opts *res_opts); -extern void conn_destroy(struct kref *kref); -struct drbd_tconn *conn_get_by_name(const char *name); -extern struct drbd_tconn *conn_get_by_addrs(void *my_addr, int my_addr_len, - void *peer_addr, int peer_addr_len); -extern void conn_free_crypto(struct drbd_tconn *tconn); +extern struct drbd_conf *drbd_new_device(unsigned int minor); +extern void drbd_free_mdev(struct drbd_conf *mdev); extern int proc_details; /* drbd_req */ -extern void __drbd_make_request(struct drbd_conf *, struct bio *, unsigned long); extern void drbd_make_request(struct request_queue *q, struct bio *bio); extern int drbd_read_remote(struct drbd_conf *mdev, struct drbd_request *req); extern int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec); @@ -1426,11 +1550,10 @@ extern int is_valid_ar_handle(struct drbd_request *, sector_t); /* drbd_nl.c */ -extern int drbd_msg_put_info(const char *info); extern void drbd_suspend_io(struct drbd_conf *mdev); extern void drbd_resume_io(struct drbd_conf 
*mdev); extern char *ppsize(char *buf, unsigned long long size); -extern sector_t drbd_new_dev_size(struct drbd_conf *, struct drbd_backing_dev *, sector_t, int); +extern sector_t drbd_new_dev_size(struct drbd_conf *, struct drbd_backing_dev *, int); enum determine_dev_size { dev_size_error = -1, unchanged = 0, shrunk = 1, grew = 2 }; extern enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *, enum dds_flags) __must_hold(local); extern void resync_after_online_grow(struct drbd_conf *); @@ -1438,14 +1561,13 @@ extern void drbd_reconsider_max_bio_size(struct drbd_conf *mdev); extern enum drbd_state_rv drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force); -extern bool conn_try_outdate_peer(struct drbd_tconn *tconn); -extern void conn_try_outdate_peer_async(struct drbd_tconn *tconn); +extern enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev); +extern void drbd_try_outdate_peer_async(struct drbd_conf *mdev); extern int drbd_khelper(struct drbd_conf *mdev, char *cmd); /* drbd_worker.c */ extern int drbd_worker(struct drbd_thread *thi); -enum drbd_ret_code drbd_resync_after_valid(struct drbd_conf *mdev, int o_minor); -void drbd_resync_after_changed(struct drbd_conf *mdev); +extern int drbd_alter_sa(struct drbd_conf *mdev, int na); extern void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side); extern void resume_next_sg(struct drbd_conf *mdev); extern void suspend_other_sg(struct drbd_conf *mdev); @@ -1454,13 +1576,13 @@ extern int drbd_resync_finished(struct drbd_conf *mdev); extern void *drbd_md_get_buffer(struct drbd_conf *mdev); extern void drbd_md_put_buffer(struct drbd_conf *mdev); extern int drbd_md_sync_page_io(struct drbd_conf *mdev, - struct drbd_backing_dev *bdev, sector_t sector, int rw); -extern void drbd_ov_out_of_sync_found(struct drbd_conf *, sector_t, int); -extern void wait_until_done_or_force_detached(struct drbd_conf *mdev, - struct drbd_backing_dev *bdev, unsigned int *done); + struct drbd_backing_dev *bdev, sector_t sector, int rw); +extern void wait_until_done_or_disk_failure(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, + unsigned int *done); +extern void drbd_ov_oos_found(struct drbd_conf*, sector_t, int); extern void drbd_rs_controller_reset(struct drbd_conf *mdev); -static inline void ov_out_of_sync_print(struct drbd_conf *mdev) +static inline void ov_oos_print(struct drbd_conf *mdev) { if (mdev->ov_last_oos_size) { dev_err(DEV, "Out of sync: start=%llu, size=%lu (sectors)\n", @@ -1472,102 +1594,97 @@ static inline void ov_out_of_sync_print(struct drbd_conf *mdev) extern void drbd_csum_bio(struct drbd_conf *, struct crypto_hash *, struct bio *, void *); -extern void drbd_csum_ee(struct drbd_conf *, struct crypto_hash *, - struct drbd_peer_request *, void *); +extern void drbd_csum_ee(struct drbd_conf *, struct crypto_hash *, struct drbd_epoch_entry *, void *); /* worker callbacks */ -extern int w_e_end_data_req(struct drbd_work *, int); -extern int w_e_end_rsdata_req(struct drbd_work *, int); -extern int w_e_end_csum_rs_req(struct drbd_work *, int); -extern int w_e_end_ov_reply(struct drbd_work *, int); -extern int w_e_end_ov_req(struct drbd_work *, int); -extern int w_ov_finished(struct drbd_work *, int); -extern int w_resync_timer(struct drbd_work *, int); -extern int w_send_write_hint(struct drbd_work *, int); -extern int w_make_resync_request(struct drbd_work *, int); -extern int w_send_dblock(struct drbd_work *, int); -extern int w_send_read_req(struct drbd_work *, int); -extern int 
w_prev_work_done(struct drbd_work *, int); -extern int w_e_reissue(struct drbd_work *, int); -extern int w_restart_disk_io(struct drbd_work *, int); -extern int w_send_out_of_sync(struct drbd_work *, int); -extern int w_start_resync(struct drbd_work *, int); +extern int w_req_cancel_conflict(struct drbd_conf *, struct drbd_work *, int); +extern int w_read_retry_remote(struct drbd_conf *, struct drbd_work *, int); +extern int w_e_end_data_req(struct drbd_conf *, struct drbd_work *, int); +extern int w_e_end_rsdata_req(struct drbd_conf *, struct drbd_work *, int); +extern int w_e_end_csum_rs_req(struct drbd_conf *, struct drbd_work *, int); +extern int w_e_end_ov_reply(struct drbd_conf *, struct drbd_work *, int); +extern int w_e_end_ov_req(struct drbd_conf *, struct drbd_work *, int); +extern int w_ov_finished(struct drbd_conf *, struct drbd_work *, int); +extern int w_resync_timer(struct drbd_conf *, struct drbd_work *, int); +extern int w_resume_next_sg(struct drbd_conf *, struct drbd_work *, int); +extern int w_send_write_hint(struct drbd_conf *, struct drbd_work *, int); +extern int w_send_dblock(struct drbd_conf *, struct drbd_work *, int); +extern int w_send_barrier(struct drbd_conf *, struct drbd_work *, int); +extern int w_send_read_req(struct drbd_conf *, struct drbd_work *, int); +extern int w_prev_work_done(struct drbd_conf *, struct drbd_work *, int); +extern int w_e_reissue(struct drbd_conf *, struct drbd_work *, int); +extern int w_restart_disk_io(struct drbd_conf *, struct drbd_work *, int); +extern int w_send_oos(struct drbd_conf *, struct drbd_work *, int); +extern int w_start_resync(struct drbd_conf *, struct drbd_work *, int); extern void resync_timer_fn(unsigned long data); extern void start_resync_timer_fn(unsigned long data); /* drbd_receiver.c */ extern int drbd_rs_should_slow_down(struct drbd_conf *mdev, sector_t sector); -extern int drbd_submit_peer_request(struct drbd_conf *, - struct drbd_peer_request *, const unsigned, - const int); -extern int drbd_free_peer_reqs(struct drbd_conf *, struct list_head *); -extern struct drbd_peer_request *drbd_alloc_peer_req(struct drbd_conf *, u64, - sector_t, unsigned int, - gfp_t) __must_hold(local); -extern void __drbd_free_peer_req(struct drbd_conf *, struct drbd_peer_request *, - int); -#define drbd_free_peer_req(m,e) __drbd_free_peer_req(m, e, 0) -#define drbd_free_net_peer_req(m,e) __drbd_free_peer_req(m, e, 1) -extern struct page *drbd_alloc_pages(struct drbd_conf *, unsigned int, bool); +extern int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, + const unsigned rw, const int fault_type); +extern int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list); +extern struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev, + u64 id, + sector_t sector, + unsigned int data_size, + gfp_t gfp_mask) __must_hold(local); +extern void drbd_free_some_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, + int is_net); +#define drbd_free_ee(m,e) drbd_free_some_ee(m, e, 0) +#define drbd_free_net_ee(m,e) drbd_free_some_ee(m, e, 1) +extern void drbd_wait_ee_list_empty(struct drbd_conf *mdev, + struct list_head *head); +extern void _drbd_wait_ee_list_empty(struct drbd_conf *mdev, + struct list_head *head); extern void drbd_set_recv_tcq(struct drbd_conf *mdev, int tcq_enabled); extern void _drbd_clear_done_ee(struct drbd_conf *mdev, struct list_head *to_be_freed); -extern void conn_flush_workqueue(struct drbd_tconn *tconn); -extern int drbd_connected(struct drbd_conf *mdev); -static inline void 
drbd_flush_workqueue(struct drbd_conf *mdev) -{ - conn_flush_workqueue(mdev->tconn); -} +extern void drbd_flush_workqueue(struct drbd_conf *mdev); +extern void drbd_free_tl_hash(struct drbd_conf *mdev); -/* Yes, there is kernel_setsockopt, but only since 2.6.18. - * So we have our own copy of it here. */ +/* yes, there is kernel_setsockopt, but only since 2.6.18. we don't need to + * mess with get_fs/set_fs, we know we are KERNEL_DS always. */ static inline int drbd_setsockopt(struct socket *sock, int level, int optname, - char *optval, int optlen) + char __user *optval, int optlen) { - mm_segment_t oldfs = get_fs(); - char __user *uoptval; int err; - - uoptval = (char __user __force *)optval; - - set_fs(KERNEL_DS); if (level == SOL_SOCKET) - err = sock_setsockopt(sock, level, optname, uoptval, optlen); + err = sock_setsockopt(sock, level, optname, optval, optlen); else - err = sock->ops->setsockopt(sock, level, optname, uoptval, + err = sock->ops->setsockopt(sock, level, optname, optval, optlen); - set_fs(oldfs); return err; } static inline void drbd_tcp_cork(struct socket *sock) { - int val = 1; + int __user val = 1; (void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK, - (char*)&val, sizeof(val)); + (char __user *)&val, sizeof(val)); } static inline void drbd_tcp_uncork(struct socket *sock) { - int val = 0; + int __user val = 0; (void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK, - (char*)&val, sizeof(val)); + (char __user *)&val, sizeof(val)); } static inline void drbd_tcp_nodelay(struct socket *sock) { - int val = 1; + int __user val = 1; (void) drbd_setsockopt(sock, SOL_TCP, TCP_NODELAY, - (char*)&val, sizeof(val)); + (char __user *)&val, sizeof(val)); } static inline void drbd_tcp_quickack(struct socket *sock) { - int val = 2; + int __user val = 2; (void) drbd_setsockopt(sock, SOL_TCP, TCP_QUICKACK, - (char*)&val, sizeof(val)); + (char __user *)&val, sizeof(val)); } -void drbd_bump_write_ordering(struct drbd_tconn *tconn, enum write_ordering_e wo); +void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo); /* drbd_proc.c */ extern struct proc_dir_entry *drbd_proc; @@ -1576,8 +1693,8 @@ extern const char *drbd_conn_str(enum drbd_conns s); extern const char *drbd_role_str(enum drbd_role s); /* drbd_actlog.c */ -extern void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i); -extern void drbd_al_complete_io(struct drbd_conf *mdev, struct drbd_interval *i); +extern void drbd_al_begin_io(struct drbd_conf *mdev, sector_t sector); +extern void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector); extern void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector); extern int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector); extern int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector); @@ -1585,6 +1702,7 @@ extern void drbd_rs_cancel_all(struct drbd_conf *mdev); extern int drbd_rs_del_all(struct drbd_conf *mdev); extern void drbd_rs_failed_io(struct drbd_conf *mdev, sector_t sector, int size); +extern int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *); extern void drbd_advance_rs_marks(struct drbd_conf *mdev, unsigned long still_to_go); extern void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size, const char *file, const unsigned int line); @@ -1594,24 +1712,73 @@ extern int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size, const char *file, const unsigned int line); #define drbd_set_out_of_sync(mdev, sector, size) \ __drbd_set_out_of_sync(mdev, sector, size, 
__FILE__, __LINE__) +extern void drbd_al_apply_to_bm(struct drbd_conf *mdev); extern void drbd_al_shrink(struct drbd_conf *mdev); + /* drbd_nl.c */ -/* state info broadcast */ -struct sib_info { - enum drbd_state_info_bcast_reason sib_reason; - union { - struct { - char *helper_name; - unsigned helper_exit_code; - }; - struct { - union drbd_state os; - union drbd_state ns; - }; - }; -}; -void drbd_bcast_event(struct drbd_conf *mdev, const struct sib_info *sib); + +void drbd_nl_cleanup(void); +int __init drbd_nl_init(void); +void drbd_bcast_state(struct drbd_conf *mdev, union drbd_state); +void drbd_bcast_sync_progress(struct drbd_conf *mdev); +void drbd_bcast_ee(struct drbd_conf *mdev, + const char *reason, const int dgs, + const char* seen_hash, const char* calc_hash, + const struct drbd_epoch_entry* e); + + +/** + * DOC: DRBD State macros + * + * These macros are used to express state changes in easily readable form. + * + * The NS macros expand to a mask and a value, that can be bit ored onto the + * current state as soon as the spinlock (req_lock) was taken. + * + * The _NS macros are used for state functions that get called with the + * spinlock. These macros expand directly to the new state value. + * + * Besides the basic forms NS() and _NS() additional _?NS[23] are defined + * to express state changes that affect more than one aspect of the state. + * + * E.g. NS2(conn, C_CONNECTED, peer, R_SECONDARY) + * Means that the network connection was established and that the peer + * is in secondary role. + */ +#define role_MASK R_MASK +#define peer_MASK R_MASK +#define disk_MASK D_MASK +#define pdsk_MASK D_MASK +#define conn_MASK C_MASK +#define susp_MASK 1 +#define user_isp_MASK 1 +#define aftr_isp_MASK 1 +#define susp_nod_MASK 1 +#define susp_fen_MASK 1 + +#define NS(T, S) \ + ({ union drbd_state mask; mask.i = 0; mask.T = T##_MASK; mask; }), \ + ({ union drbd_state val; val.i = 0; val.T = (S); val; }) +#define NS2(T1, S1, T2, S2) \ + ({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \ + mask.T2 = T2##_MASK; mask; }), \ + ({ union drbd_state val; val.i = 0; val.T1 = (S1); \ + val.T2 = (S2); val; }) +#define NS3(T1, S1, T2, S2, T3, S3) \ + ({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \ + mask.T2 = T2##_MASK; mask.T3 = T3##_MASK; mask; }), \ + ({ union drbd_state val; val.i = 0; val.T1 = (S1); \ + val.T2 = (S2); val.T3 = (S3); val; }) + +#define _NS(D, T, S) \ + D, ({ union drbd_state __ns; __ns.i = D->state.i; __ns.T = (S); __ns; }) +#define _NS2(D, T1, S1, T2, S2) \ + D, ({ union drbd_state __ns; __ns.i = D->state.i; __ns.T1 = (S1); \ + __ns.T2 = (S2); __ns; }) +#define _NS3(D, T1, S1, T2, S2, T3, S3) \ + D, ({ union drbd_state __ns; __ns.i = D->state.i; __ns.T1 = (S1); \ + __ns.T2 = (S2); __ns.T3 = (S3); __ns; }) /* * inline helper functions @@ -1628,10 +1795,9 @@ static inline struct page *page_chain_next(struct page *page) #define page_chain_for_each_safe(page, n) \ for (; page && ({ n = page_chain_next(page); 1; }); page = n) - -static inline int drbd_peer_req_has_active_page(struct drbd_peer_request *peer_req) +static inline int drbd_ee_has_active_page(struct drbd_epoch_entry *e) { - struct page *page = peer_req->pages; + struct page *page = e->pages; page_chain_for_each(page) { if (page_count(page) > 1) return 1; @@ -1639,6 +1805,18 @@ static inline int drbd_peer_req_has_active_page(struct drbd_peer_request *peer_r return 0; } +static inline void drbd_state_lock(struct drbd_conf *mdev) +{ + wait_event(mdev->misc_wait, + !test_and_set_bit(CLUSTER_ST_CHANGE, 
&mdev->flags)); +} + +static inline void drbd_state_unlock(struct drbd_conf *mdev) +{ + clear_bit(CLUSTER_ST_CHANGE, &mdev->flags); + wake_up(&mdev->misc_wait); +} + static inline enum drbd_state_rv _drbd_set_state(struct drbd_conf *mdev, union drbd_state ns, enum chg_state_flags flags, struct completion *done) @@ -1652,71 +1830,48 @@ _drbd_set_state(struct drbd_conf *mdev, union drbd_state ns, return rv; } -static inline union drbd_state drbd_read_state(struct drbd_conf *mdev) +/** + * drbd_request_state() - Reqest a state change + * @mdev: DRBD device. + * @mask: mask of state bits to change. + * @val: value of new state bits. + * + * This is the most graceful way of requesting a state change. It is verbose + * quite verbose in case the state change is not possible, and all those + * state changes are globally serialized. + */ +static inline int drbd_request_state(struct drbd_conf *mdev, + union drbd_state mask, + union drbd_state val) { - union drbd_state rv; - - rv.i = mdev->state.i; - rv.susp = mdev->tconn->susp; - rv.susp_nod = mdev->tconn->susp_nod; - rv.susp_fen = mdev->tconn->susp_fen; - - return rv; + return _drbd_request_state(mdev, mask, val, CS_VERBOSE + CS_ORDERED); } enum drbd_force_detach_flags { - DRBD_READ_ERROR, - DRBD_WRITE_ERROR, + DRBD_IO_ERROR, DRBD_META_IO_ERROR, DRBD_FORCE_DETACH, }; #define __drbd_chk_io_error(m,f) __drbd_chk_io_error_(m,f, __func__) static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, - enum drbd_force_detach_flags df, + enum drbd_force_detach_flags forcedetach, const char *where) { - enum drbd_io_error_p ep; - - rcu_read_lock(); - ep = rcu_dereference(mdev->ldev->disk_conf)->on_io_error; - rcu_read_unlock(); - switch (ep) { - case EP_PASS_ON: /* FIXME would this be better named "Ignore"? */ - if (df == DRBD_READ_ERROR || df == DRBD_WRITE_ERROR) { + switch (mdev->ldev->dc.on_io_error) { + case EP_PASS_ON: + if (forcedetach == DRBD_IO_ERROR) { if (__ratelimit(&drbd_ratelimit_state)) dev_err(DEV, "Local IO failed in %s.\n", where); if (mdev->state.disk > D_INCONSISTENT) _drbd_set_state(_NS(mdev, disk, D_INCONSISTENT), CS_HARD, NULL); break; } - /* NOTE fall through for DRBD_META_IO_ERROR or DRBD_FORCE_DETACH */ + /* NOTE fall through to detach case if forcedetach set */ case EP_DETACH: case EP_CALL_HELPER: - /* Remember whether we saw a READ or WRITE error. - * - * Recovery of the affected area for WRITE failure is covered - * by the activity log. - * READ errors may fall outside that area though. Certain READ - * errors can be "healed" by writing good data to the affected - * blocks, which triggers block re-allocation in lower layers. - * - * If we can not write the bitmap after a READ error, - * we may need to trigger a full sync (see w_go_diskless()). - * - * Force-detach is not really an IO error, but rather a - * desperate measure to try to deal with a completely - * unresponsive lower level IO stack. - * Still it should be treated as a WRITE error. - * - * Meta IO error is always WRITE error: - * we read meta data only once during attach, - * which will fail in case of errors. 
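Taken together, the NS*()/_NS*() macros above and drbd_request_state() give the usual calling pattern for a cluster-wide state change. A minimal sketch, mirroring the NS2() case from the DOC comment; the example_request_connected() helper is made up purely for illustration:

static int example_request_connected(struct drbd_conf *mdev)
{
        /* NS2() expands to a mask/value pair: request that .conn become
         * C_CONNECTED and .peer become R_SECONDARY, while all other
         * fields of the state word stay as they are. */
        return drbd_request_state(mdev, NS2(conn, C_CONNECTED, peer, R_SECONDARY));
}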
- */ set_bit(WAS_IO_ERROR, &mdev->flags); - if (df == DRBD_READ_ERROR) - set_bit(WAS_READ_ERROR, &mdev->flags); - if (df == DRBD_FORCE_DETACH) + if (forcedetach == DRBD_FORCE_DETACH) set_bit(FORCE_DETACH, &mdev->flags); if (mdev->state.disk > D_FAILED) { _drbd_set_state(_NS(mdev, disk, D_FAILED), CS_HARD, NULL); @@ -1741,9 +1896,9 @@ static inline void drbd_chk_io_error_(struct drbd_conf *mdev, { if (error) { unsigned long flags; - spin_lock_irqsave(&mdev->tconn->req_lock, flags); + spin_lock_irqsave(&mdev->req_lock, flags); __drbd_chk_io_error_(mdev, forcedetach, where); - spin_unlock_irqrestore(&mdev->tconn->req_lock, flags); + spin_unlock_irqrestore(&mdev->req_lock, flags); } } @@ -1755,9 +1910,9 @@ static inline void drbd_chk_io_error_(struct drbd_conf *mdev, * BTW, for internal meta data, this happens to be the maximum capacity * we could agree upon with our peer node. */ -static inline sector_t _drbd_md_first_sector(int meta_dev_idx, struct drbd_backing_dev *bdev) +static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev) { - switch (meta_dev_idx) { + switch (bdev->dc.meta_dev_idx) { case DRBD_MD_INDEX_INTERNAL: case DRBD_MD_INDEX_FLEX_INT: return bdev->md.md_offset + bdev->md.bm_offset; @@ -1767,30 +1922,13 @@ static inline sector_t _drbd_md_first_sector(int meta_dev_idx, struct drbd_backi } } -static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev) -{ - int meta_dev_idx; - - rcu_read_lock(); - meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx; - rcu_read_unlock(); - - return _drbd_md_first_sector(meta_dev_idx, bdev); -} - /** * drbd_md_last_sector() - Return the last sector number of the meta data area * @bdev: Meta data block device. */ static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev) { - int meta_dev_idx; - - rcu_read_lock(); - meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx; - rcu_read_unlock(); - - switch (meta_dev_idx) { + switch (bdev->dc.meta_dev_idx) { case DRBD_MD_INDEX_INTERNAL: case DRBD_MD_INDEX_FLEX_INT: return bdev->md.md_offset + MD_AL_OFFSET - 1; @@ -1818,18 +1956,12 @@ static inline sector_t drbd_get_capacity(struct block_device *bdev) static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev) { sector_t s; - int meta_dev_idx; - - rcu_read_lock(); - meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx; - rcu_read_unlock(); - - switch (meta_dev_idx) { + switch (bdev->dc.meta_dev_idx) { case DRBD_MD_INDEX_INTERNAL: case DRBD_MD_INDEX_FLEX_INT: s = drbd_get_capacity(bdev->backing_bdev) ? 
min_t(sector_t, DRBD_MAX_SECTORS_FLEX, - _drbd_md_first_sector(meta_dev_idx, bdev)) + drbd_md_first_sector(bdev)) : 0; break; case DRBD_MD_INDEX_FLEX_EXT: @@ -1855,15 +1987,9 @@ static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev) static inline sector_t drbd_md_ss__(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) { - int meta_dev_idx; - - rcu_read_lock(); - meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx; - rcu_read_unlock(); - - switch (meta_dev_idx) { + switch (bdev->dc.meta_dev_idx) { default: /* external, some index */ - return MD_RESERVED_SECT * meta_dev_idx; + return MD_RESERVED_SECT * bdev->dc.meta_dev_idx; case DRBD_MD_INDEX_INTERNAL: /* with drbd08, internal meta data is always "flexible" */ case DRBD_MD_INDEX_FLEX_INT: @@ -1889,8 +2015,9 @@ drbd_queue_work_front(struct drbd_work_queue *q, struct drbd_work *w) unsigned long flags; spin_lock_irqsave(&q->q_lock, flags); list_add(&w->list, &q->q); + up(&q->s); /* within the spinlock, + see comment near end of drbd_worker() */ spin_unlock_irqrestore(&q->q_lock, flags); - wake_up(&q->q_wait); } static inline void @@ -1899,35 +2026,41 @@ drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w) unsigned long flags; spin_lock_irqsave(&q->q_lock, flags); list_add_tail(&w->list, &q->q); + up(&q->s); /* within the spinlock, + see comment near end of drbd_worker() */ spin_unlock_irqrestore(&q->q_lock, flags); - wake_up(&q->q_wait); } -static inline void wake_asender(struct drbd_tconn *tconn) +static inline void wake_asender(struct drbd_conf *mdev) +{ + if (test_bit(SIGNAL_ASENDER, &mdev->flags)) + force_sig(DRBD_SIG, mdev->asender.task); +} + +static inline void request_ping(struct drbd_conf *mdev) { - if (test_bit(SIGNAL_ASENDER, &tconn->flags)) - force_sig(DRBD_SIG, tconn->asender.task); + set_bit(SEND_PING, &mdev->flags); + wake_asender(mdev); } -static inline void request_ping(struct drbd_tconn *tconn) +static inline int drbd_send_short_cmd(struct drbd_conf *mdev, + enum drbd_packets cmd) { - set_bit(SEND_PING, &tconn->flags); - wake_asender(tconn); + struct p_header80 h; + return drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd, &h, sizeof(h)); } -extern void *conn_prepare_command(struct drbd_tconn *, struct drbd_socket *); -extern void *drbd_prepare_command(struct drbd_conf *, struct drbd_socket *); -extern int conn_send_command(struct drbd_tconn *, struct drbd_socket *, - enum drbd_packet, unsigned int, void *, - unsigned int); -extern int drbd_send_command(struct drbd_conf *, struct drbd_socket *, - enum drbd_packet, unsigned int, void *, - unsigned int); +static inline int drbd_send_ping(struct drbd_conf *mdev) +{ + struct p_header80 h; + return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING, &h, sizeof(h)); +} -extern int drbd_send_ping(struct drbd_tconn *tconn); -extern int drbd_send_ping_ack(struct drbd_tconn *tconn); -extern int drbd_send_state_req(struct drbd_conf *, union drbd_state, union drbd_state); -extern int conn_send_state_req(struct drbd_tconn *, union drbd_state, union drbd_state); +static inline int drbd_send_ping_ack(struct drbd_conf *mdev) +{ + struct p_header80 h; + return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING_ACK, &h, sizeof(h)); +} static inline void drbd_thread_stop(struct drbd_thread *thi) { @@ -1949,21 +2082,21 @@ static inline void drbd_thread_restart_nowait(struct drbd_thread *thi) * or implicit barrier packets as necessary. 
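As a short illustration of the queue helpers above: a caller fills in the callback (matching the w_*() prototypes earlier in this header) and queues the work; the up(&q->s) inside the helpers is what wakes the worker thread. The example_queue_work() name is hypothetical:

static void example_queue_work(struct drbd_conf *mdev, struct drbd_work *w,
                               int (*cb)(struct drbd_conf *, struct drbd_work *, int))
{
        w->cb = cb;                             /* e.g. w_send_barrier */
        drbd_queue_work(&mdev->data.work, w);   /* worker processes in order */
}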
* increased: * w_send_barrier - * _req_mod(req, QUEUE_FOR_NET_WRITE or QUEUE_FOR_NET_READ); + * _req_mod(req, queue_for_net_write or queue_for_net_read); * it is much easier and equally valid to count what we queue for the * worker, even before it actually was queued or send. * (drbd_make_request_common; recovery path on read io-error) * decreased: * got_BarrierAck (respective tl_clear, tl_clear_barrier) - * _req_mod(req, DATA_RECEIVED) + * _req_mod(req, data_received) * [from receive_DataReply] - * _req_mod(req, WRITE_ACKED_BY_PEER or RECV_ACKED_BY_PEER or NEG_ACKED) + * _req_mod(req, write_acked_by_peer or recv_acked_by_peer or neg_acked) * [from got_BlockAck (P_WRITE_ACK, P_RECV_ACK)] * for some reason it is NOT decreased in got_NegAck, * but in the resulting cleanup code from report_params. * we should try to remember the reason for that... - * _req_mod(req, SEND_FAILED or SEND_CANCELED) - * _req_mod(req, CONNECTION_LOST_WHILE_PENDING) + * _req_mod(req, send_failed or send_canceled) + * _req_mod(req, connection_lost_while_pending) * [from tl_clear_barrier] */ static inline void inc_ap_pending(struct drbd_conf *mdev) @@ -1971,19 +2104,17 @@ static inline void inc_ap_pending(struct drbd_conf *mdev) atomic_inc(&mdev->ap_pending_cnt); } -#define ERR_IF_CNT_IS_NEGATIVE(which, func, line) \ - if (atomic_read(&mdev->which) < 0) \ +#define ERR_IF_CNT_IS_NEGATIVE(which) \ + if (atomic_read(&mdev->which) < 0) \ dev_err(DEV, "in %s:%d: " #which " = %d < 0 !\n", \ - func, line, \ - atomic_read(&mdev->which)) + __func__ , __LINE__ , \ + atomic_read(&mdev->which)) -#define dec_ap_pending(mdev) _dec_ap_pending(mdev, __FUNCTION__, __LINE__) -static inline void _dec_ap_pending(struct drbd_conf *mdev, const char *func, int line) -{ - if (atomic_dec_and_test(&mdev->ap_pending_cnt)) - wake_up(&mdev->misc_wait); - ERR_IF_CNT_IS_NEGATIVE(ap_pending_cnt, func, line); -} +#define dec_ap_pending(mdev) do { \ + typecheck(struct drbd_conf *, mdev); \ + if (atomic_dec_and_test(&mdev->ap_pending_cnt)) \ + wake_up(&mdev->misc_wait); \ + ERR_IF_CNT_IS_NEGATIVE(ap_pending_cnt); } while (0) /* counts how many resync-related answers we still expect from the peer * increase decrease @@ -1996,12 +2127,10 @@ static inline void inc_rs_pending(struct drbd_conf *mdev) atomic_inc(&mdev->rs_pending_cnt); } -#define dec_rs_pending(mdev) _dec_rs_pending(mdev, __FUNCTION__, __LINE__) -static inline void _dec_rs_pending(struct drbd_conf *mdev, const char *func, int line) -{ - atomic_dec(&mdev->rs_pending_cnt); - ERR_IF_CNT_IS_NEGATIVE(rs_pending_cnt, func, line); -} +#define dec_rs_pending(mdev) do { \ + typecheck(struct drbd_conf *, mdev); \ + atomic_dec(&mdev->rs_pending_cnt); \ + ERR_IF_CNT_IS_NEGATIVE(rs_pending_cnt); } while (0) /* counts how many answers we still need to send to the peer. 
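The dec_*() helpers above are wrapped in do { } while (0) (plus typecheck()) so that each use expands to exactly one statement. A small illustrative sketch of why that matters; example_maybe_dec() is a made-up name:

static void example_maybe_dec(struct drbd_conf *mdev, int acked)
{
        if (acked)
                dec_ap_pending(mdev);   /* one statement, so the else below
                                         * binds exactly as expected */
        else
                dev_warn(DEV, "peer did not ack, ap_pending_cnt unchanged\n");
}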
* increased on @@ -2017,18 +2146,38 @@ static inline void inc_unacked(struct drbd_conf *mdev) atomic_inc(&mdev->unacked_cnt); } -#define dec_unacked(mdev) _dec_unacked(mdev, __FUNCTION__, __LINE__) -static inline void _dec_unacked(struct drbd_conf *mdev, const char *func, int line) +#define dec_unacked(mdev) do { \ + typecheck(struct drbd_conf *, mdev); \ + atomic_dec(&mdev->unacked_cnt); \ + ERR_IF_CNT_IS_NEGATIVE(unacked_cnt); } while (0) + +#define sub_unacked(mdev, n) do { \ + typecheck(struct drbd_conf *, mdev); \ + atomic_sub(n, &mdev->unacked_cnt); \ + ERR_IF_CNT_IS_NEGATIVE(unacked_cnt); } while (0) + + +static inline void put_net_conf(struct drbd_conf *mdev) { - atomic_dec(&mdev->unacked_cnt); - ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line); + if (atomic_dec_and_test(&mdev->net_cnt)) + wake_up(&mdev->net_cnt_wait); } -#define sub_unacked(mdev, n) _sub_unacked(mdev, n, __FUNCTION__, __LINE__) -static inline void _sub_unacked(struct drbd_conf *mdev, int n, const char *func, int line) +/** + * get_net_conf() - Increase ref count on mdev->net_conf; Returns 0 if nothing there + * @mdev: DRBD device. + * + * You have to call put_net_conf() when finished working with mdev->net_conf. + */ +static inline int get_net_conf(struct drbd_conf *mdev) { - atomic_sub(n, &mdev->unacked_cnt); - ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line); + int have_net_conf; + + atomic_inc(&mdev->net_cnt); + have_net_conf = mdev->state.conn >= C_UNCONNECTED; + if (!have_net_conf) + put_net_conf(mdev); + return have_net_conf; } /** @@ -2132,20 +2281,17 @@ static inline void drbd_get_syncer_progress(struct drbd_conf *mdev, * maybe re-implement using semaphores? */ static inline int drbd_get_max_buffers(struct drbd_conf *mdev) { - struct net_conf *nc; - int mxb; - - rcu_read_lock(); - nc = rcu_dereference(mdev->tconn->net_conf); - mxb = nc ? nc->max_buffers : 1000000; /* arbitrary limit on open requests */ - rcu_read_unlock(); - + int mxb = 1000000; /* arbitrary limit on open requests */ + if (get_net_conf(mdev)) { + mxb = mdev->net_conf->max_buffers; + put_net_conf(mdev); + } return mxb; } static inline int drbd_state_is_stable(struct drbd_conf *mdev) { - union drbd_dev_state s = mdev->state; + union drbd_state s = mdev->state; /* DO NOT add a default clause, we want the compiler to warn us * for any newly introduced state we may have forgotten to add here */ @@ -2179,7 +2325,7 @@ static inline int drbd_state_is_stable(struct drbd_conf *mdev) /* Allow IO in BM exchange states with new protocols */ case C_WF_BITMAP_S: - if (mdev->tconn->agreed_pro_version < 96) + if (mdev->agreed_pro_version < 96) return 0; break; @@ -2201,7 +2347,7 @@ static inline int drbd_state_is_stable(struct drbd_conf *mdev) /* disk state is stable as well. 
*/ break; - /* no new io accepted during transitional states */ + /* no new io accepted during tansitional states */ case D_ATTACHING: case D_NEGOTIATING: case D_UNKNOWN: @@ -2213,18 +2359,16 @@ static inline int drbd_state_is_stable(struct drbd_conf *mdev) return 1; } -static inline int drbd_suspended(struct drbd_conf *mdev) +static inline int is_susp(union drbd_state s) { - struct drbd_tconn *tconn = mdev->tconn; - - return tconn->susp || tconn->susp_fen || tconn->susp_nod; + return s.susp || s.susp_nod || s.susp_fen; } static inline bool may_inc_ap_bio(struct drbd_conf *mdev) { int mxb = drbd_get_max_buffers(mdev); - if (drbd_suspended(mdev)) + if (is_susp(mdev->state)) return false; if (test_bit(SUSPEND_IO, &mdev->flags)) return false; @@ -2246,30 +2390,30 @@ static inline bool may_inc_ap_bio(struct drbd_conf *mdev) return true; } -static inline bool inc_ap_bio_cond(struct drbd_conf *mdev) +static inline bool inc_ap_bio_cond(struct drbd_conf *mdev, int count) { bool rv = false; - spin_lock_irq(&mdev->tconn->req_lock); + spin_lock_irq(&mdev->req_lock); rv = may_inc_ap_bio(mdev); if (rv) - atomic_inc(&mdev->ap_bio_cnt); - spin_unlock_irq(&mdev->tconn->req_lock); + atomic_add(count, &mdev->ap_bio_cnt); + spin_unlock_irq(&mdev->req_lock); return rv; } -static inline void inc_ap_bio(struct drbd_conf *mdev) +static inline void inc_ap_bio(struct drbd_conf *mdev, int count) { /* we wait here * as long as the device is suspended * until the bitmap is no longer on the fly during connection - * handshake as long as we would exceed the max_buffer limit. + * handshake as long as we would exeed the max_buffer limit. * * to avoid races with the reconnect code, * we need to atomic_inc within the spinlock. */ - wait_event(mdev->misc_wait, inc_ap_bio_cond(mdev)); + wait_event(mdev->misc_wait, inc_ap_bio_cond(mdev, count)); } static inline void dec_ap_bio(struct drbd_conf *mdev) @@ -2281,7 +2425,7 @@ static inline void dec_ap_bio(struct drbd_conf *mdev) if (ap_bio == 0 && test_bit(BITMAP_IO, &mdev->flags)) { if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags)) - drbd_queue_work(&mdev->tconn->sender_work, &mdev->bm_io_work.w); + drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w); } /* this currently does wake_up for every dec_ap_bio! @@ -2291,12 +2435,6 @@ static inline void dec_ap_bio(struct drbd_conf *mdev) wake_up(&mdev->misc_wait); } -static inline bool verify_can_do_stop_sector(struct drbd_conf *mdev) -{ - return mdev->tconn->agreed_pro_version >= 97 && - mdev->tconn->agreed_pro_version != 100; -} - static inline int drbd_set_ed_uuid(struct drbd_conf *mdev, u64 val) { int changed = mdev->ed_uuid != val; @@ -2304,6 +2442,40 @@ static inline int drbd_set_ed_uuid(struct drbd_conf *mdev, u64 val) return changed; } +static inline int seq_cmp(u32 a, u32 b) +{ + /* we assume wrap around at 32bit. + * for wrap around at 24bit (old atomic_t), + * we'd have to + * a <<= 8; b <<= 8; + */ + return (s32)(a) - (s32)(b); +} +#define seq_lt(a, b) (seq_cmp((a), (b)) < 0) +#define seq_gt(a, b) (seq_cmp((a), (b)) > 0) +#define seq_ge(a, b) (seq_cmp((a), (b)) >= 0) +#define seq_le(a, b) (seq_cmp((a), (b)) <= 0) +/* CAUTION: please no side effects in arguments! */ +#define seq_max(a, b) ((u32)(seq_gt((a), (b)) ? 
(a) : (b))) + +static inline void update_peer_seq(struct drbd_conf *mdev, unsigned int new_seq) +{ + unsigned int m; + spin_lock(&mdev->peer_seq_lock); + m = seq_max(mdev->peer_seq, new_seq); + mdev->peer_seq = m; + spin_unlock(&mdev->peer_seq_lock); + if (m == new_seq) + wake_up(&mdev->seq_wait); +} + +static inline void drbd_update_congested(struct drbd_conf *mdev) +{ + struct sock *sk = mdev->data.socket->sk; + if (sk->sk_wmem_queued > sk->sk_sndbuf * 4 / 5) + set_bit(NET_CONGESTED, &mdev->flags); +} + static inline int drbd_queue_order_type(struct drbd_conf *mdev) { /* sorry, we currently have no working implementation @@ -2318,15 +2490,10 @@ static inline void drbd_md_flush(struct drbd_conf *mdev) { int r; - if (mdev->ldev == NULL) { - dev_warn(DEV, "mdev->ldev == NULL in drbd_md_flush\n"); - return; - } - if (test_bit(MD_NO_FUA, &mdev->flags)) return; - r = blkdev_issue_flush(mdev->ldev->md_bdev, GFP_NOIO, NULL); + r = blkdev_issue_flush(mdev->ldev->md_bdev, GFP_KERNEL, NULL); if (r) { set_bit(MD_NO_FUA, &mdev->flags); dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r); diff --git a/trunk/drivers/block/drbd/drbd_interval.c b/trunk/drivers/block/drbd/drbd_interval.c deleted file mode 100644 index 89c497c630b4..000000000000 --- a/trunk/drivers/block/drbd/drbd_interval.c +++ /dev/null @@ -1,207 +0,0 @@ -#include -#include -#include "drbd_interval.h" - -/** - * interval_end - return end of @node - */ -static inline -sector_t interval_end(struct rb_node *node) -{ - struct drbd_interval *this = rb_entry(node, struct drbd_interval, rb); - return this->end; -} - -/** - * compute_subtree_last - compute end of @node - * - * The end of an interval is the highest (start + (size >> 9)) value of this - * node and of its children. Called for @node and its parents whenever the end - * may have changed. 
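The seq_cmp()/seq_lt()/seq_max() helpers above implement serial-number style comparison that stays correct across the 32-bit wrap. A tiny illustrative check; example_seq_wrap() is a made-up name and the values are chosen only to show the wrap case:

static void example_seq_wrap(void)
{
        u32 old_seq = 0xfffffffeU;      /* just before the 32-bit wrap */
        u32 new_seq = 2;                /* just after the wrap */

        /* (s32)0xfffffffe - (s32)2 == -4, so the pre-wrap value still
         * compares as "older" although it is numerically larger. */
        WARN_ON(!seq_lt(old_seq, new_seq));
        WARN_ON(seq_max(old_seq, new_seq) != new_seq);
}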
- */ -static inline sector_t -compute_subtree_last(struct drbd_interval *node) -{ - sector_t max = node->sector + (node->size >> 9); - - if (node->rb.rb_left) { - sector_t left = interval_end(node->rb.rb_left); - if (left > max) - max = left; - } - if (node->rb.rb_right) { - sector_t right = interval_end(node->rb.rb_right); - if (right > max) - max = right; - } - return max; -} - -static void augment_propagate(struct rb_node *rb, struct rb_node *stop) -{ - while (rb != stop) { - struct drbd_interval *node = rb_entry(rb, struct drbd_interval, rb); - sector_t subtree_last = compute_subtree_last(node); - if (node->end == subtree_last) - break; - node->end = subtree_last; - rb = rb_parent(&node->rb); - } -} - -static void augment_copy(struct rb_node *rb_old, struct rb_node *rb_new) -{ - struct drbd_interval *old = rb_entry(rb_old, struct drbd_interval, rb); - struct drbd_interval *new = rb_entry(rb_new, struct drbd_interval, rb); - - new->end = old->end; -} - -static void augment_rotate(struct rb_node *rb_old, struct rb_node *rb_new) -{ - struct drbd_interval *old = rb_entry(rb_old, struct drbd_interval, rb); - struct drbd_interval *new = rb_entry(rb_new, struct drbd_interval, rb); - - new->end = old->end; - old->end = compute_subtree_last(old); -} - -static const struct rb_augment_callbacks augment_callbacks = { - augment_propagate, - augment_copy, - augment_rotate, -}; - -/** - * drbd_insert_interval - insert a new interval into a tree - */ -bool -drbd_insert_interval(struct rb_root *root, struct drbd_interval *this) -{ - struct rb_node **new = &root->rb_node, *parent = NULL; - - BUG_ON(!IS_ALIGNED(this->size, 512)); - - while (*new) { - struct drbd_interval *here = - rb_entry(*new, struct drbd_interval, rb); - - parent = *new; - if (this->sector < here->sector) - new = &(*new)->rb_left; - else if (this->sector > here->sector) - new = &(*new)->rb_right; - else if (this < here) - new = &(*new)->rb_left; - else if (this > here) - new = &(*new)->rb_right; - else - return false; - } - - rb_link_node(&this->rb, parent, new); - rb_insert_augmented(&this->rb, root, &augment_callbacks); - return true; -} - -/** - * drbd_contains_interval - check if a tree contains a given interval - * @sector: start sector of @interval - * @interval: may not be a valid pointer - * - * Returns if the tree contains the node @interval with start sector @start. - * Does not dereference @interval until @interval is known to be a valid object - * in @tree. Returns %false if @interval is in the tree but with a different - * sector number. - */ -bool -drbd_contains_interval(struct rb_root *root, sector_t sector, - struct drbd_interval *interval) -{ - struct rb_node *node = root->rb_node; - - while (node) { - struct drbd_interval *here = - rb_entry(node, struct drbd_interval, rb); - - if (sector < here->sector) - node = node->rb_left; - else if (sector > here->sector) - node = node->rb_right; - else if (interval < here) - node = node->rb_left; - else if (interval > here) - node = node->rb_right; - else - return true; - } - return false; -} - -/** - * drbd_remove_interval - remove an interval from a tree - */ -void -drbd_remove_interval(struct rb_root *root, struct drbd_interval *this) -{ - rb_erase_augmented(&this->rb, root, &augment_callbacks); -} - -/** - * drbd_find_overlap - search for an interval overlapping with [sector, sector + size) - * @sector: start sector - * @size: size, aligned to 512 bytes - * - * Returns an interval overlapping with [sector, sector + size), or NULL if - * there is none. 
When there is more than one overlapping interval in the - * tree, the interval with the lowest start sector is returned, and all other - * overlapping intervals will be on the right side of the tree, reachable with - * rb_next(). - */ -struct drbd_interval * -drbd_find_overlap(struct rb_root *root, sector_t sector, unsigned int size) -{ - struct rb_node *node = root->rb_node; - struct drbd_interval *overlap = NULL; - sector_t end = sector + (size >> 9); - - BUG_ON(!IS_ALIGNED(size, 512)); - - while (node) { - struct drbd_interval *here = - rb_entry(node, struct drbd_interval, rb); - - if (node->rb_left && - sector < interval_end(node->rb_left)) { - /* Overlap if any must be on left side */ - node = node->rb_left; - } else if (here->sector < end && - sector < here->sector + (here->size >> 9)) { - overlap = here; - break; - } else if (sector >= here->sector) { - /* Overlap if any must be on right side */ - node = node->rb_right; - } else - break; - } - return overlap; -} - -struct drbd_interval * -drbd_next_overlap(struct drbd_interval *i, sector_t sector, unsigned int size) -{ - sector_t end = sector + (size >> 9); - struct rb_node *node; - - for (;;) { - node = rb_next(&i->rb); - if (!node) - return NULL; - i = rb_entry(node, struct drbd_interval, rb); - if (i->sector >= end) - return NULL; - if (sector < i->sector + (i->size >> 9)) - return i; - } -} diff --git a/trunk/drivers/block/drbd/drbd_interval.h b/trunk/drivers/block/drbd/drbd_interval.h deleted file mode 100644 index f38fcb00c10d..000000000000 --- a/trunk/drivers/block/drbd/drbd_interval.h +++ /dev/null @@ -1,40 +0,0 @@ -#ifndef __DRBD_INTERVAL_H -#define __DRBD_INTERVAL_H - -#include -#include - -struct drbd_interval { - struct rb_node rb; - sector_t sector; /* start sector of the interval */ - unsigned int size; /* size in bytes */ - sector_t end; /* highest interval end in subtree */ - int local:1 /* local or remote request? 
*/; - int waiting:1; -}; - -static inline void drbd_clear_interval(struct drbd_interval *i) -{ - RB_CLEAR_NODE(&i->rb); -} - -static inline bool drbd_interval_empty(struct drbd_interval *i) -{ - return RB_EMPTY_NODE(&i->rb); -} - -extern bool drbd_insert_interval(struct rb_root *, struct drbd_interval *); -extern bool drbd_contains_interval(struct rb_root *, sector_t, - struct drbd_interval *); -extern void drbd_remove_interval(struct rb_root *, struct drbd_interval *); -extern struct drbd_interval *drbd_find_overlap(struct rb_root *, sector_t, - unsigned int); -extern struct drbd_interval *drbd_next_overlap(struct drbd_interval *, sector_t, - unsigned int); - -#define drbd_for_each_overlap(i, root, sector, size) \ - for (i = drbd_find_overlap(root, sector, size); \ - i; \ - i = drbd_next_overlap(i, sector, size)) - -#endif /* __DRBD_INTERVAL_H */ diff --git a/trunk/drivers/block/drbd/drbd_main.c b/trunk/drivers/block/drbd/drbd_main.c index 8c13eeb83c53..f55683ad4ffa 100644 --- a/trunk/drivers/block/drbd/drbd_main.c +++ b/trunk/drivers/block/drbd/drbd_main.c @@ -56,6 +56,14 @@ #include "drbd_vli.h" +struct after_state_chg_work { + struct drbd_work w; + union drbd_state os; + union drbd_state ns; + enum chg_state_flags flags; + struct completion *done; +}; + static DEFINE_MUTEX(drbd_main_mutex); int drbdd_init(struct drbd_thread *); int drbd_worker(struct drbd_thread *); @@ -64,17 +72,21 @@ int drbd_asender(struct drbd_thread *); int drbd_init(void); static int drbd_open(struct block_device *bdev, fmode_t mode); static int drbd_release(struct gendisk *gd, fmode_t mode); -static int w_md_sync(struct drbd_work *w, int unused); +static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused); +static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, + union drbd_state ns, enum chg_state_flags flags); +static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused); static void md_sync_timer_fn(unsigned long data); -static int w_bitmap_io(struct drbd_work *w, int unused); -static int w_go_diskless(struct drbd_work *w, int unused); +static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused); +static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused); +static void _tl_clear(struct drbd_conf *mdev); MODULE_AUTHOR("Philipp Reisner , " "Lars Ellenberg "); MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION); MODULE_VERSION(REL_VERSION); MODULE_LICENSE("GPL"); -MODULE_PARM_DESC(minor_count, "Approximate number of drbd devices (" +MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices (" __stringify(DRBD_MINOR_COUNT_MIN) "-" __stringify(DRBD_MINOR_COUNT_MAX) ")"); MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR); @@ -86,6 +98,7 @@ MODULE_PARM_DESC(allow_oos, "DONT USE!"); module_param(minor_count, uint, 0444); module_param(disable_sendpage, bool, 0644); module_param(allow_oos, bool, 0); +module_param(cn_idx, uint, 0444); module_param(proc_details, int, 0644); #ifdef CONFIG_DRBD_FAULT_INJECTION @@ -107,6 +120,7 @@ module_param(fault_devs, int, 0644); unsigned int minor_count = DRBD_MINOR_COUNT_DEF; bool disable_sendpage; bool allow_oos; +unsigned int cn_idx = CN_IDX_DRBD; int proc_details; /* Detail level in proc drbd*/ /* Module parameter for setting the user mode helper program @@ -118,11 +132,10 @@ module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0 /* in 2.6.x, our device mapping and config info contains our virtual gendisks * as member "struct gendisk 
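For reference, a brief sketch of how the interval-tree iterator removed above was meant to be used to scan for requests overlapping [sector, sector + size); example_has_conflict() is a hypothetical caller:

static bool example_has_conflict(struct rb_root *root, sector_t sector,
                                 unsigned int size)
{
        struct drbd_interval *i;

        /* drbd_find_overlap() yields the overlap with the lowest start
         * sector, drbd_next_overlap() walks the remaining ones in order. */
        drbd_for_each_overlap(i, root, sector, size) {
                if (i->local)   /* e.g. only local requests are of interest */
                        return true;
        }
        return false;
}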
*vdisk;" */ -struct idr minors; -struct list_head drbd_tconns; /* list of struct drbd_tconn */ +struct drbd_conf **minor_table; struct kmem_cache *drbd_request_cache; -struct kmem_cache *drbd_ee_cache; /* peer requests */ +struct kmem_cache *drbd_ee_cache; /* epoch entries */ struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */ struct kmem_cache *drbd_al_ext_cache; /* activity log extents */ mempool_t *drbd_request_mempool; @@ -151,15 +164,10 @@ static const struct block_device_operations drbd_ops = { struct bio *bio_alloc_drbd(gfp_t gfp_mask) { - struct bio *bio; - if (!drbd_md_io_bio_set) return bio_alloc(gfp_mask, 1); - bio = bio_alloc_bioset(gfp_mask, 1, drbd_md_io_bio_set); - if (!bio) - return NULL; - return bio; + return bio_alloc_bioset(gfp_mask, 1, drbd_md_io_bio_set); } #ifdef __CHECKER__ @@ -179,261 +187,1673 @@ int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins) return io_allowed; } -#endif +#endif + +/** + * DOC: The transfer log + * + * The transfer log is a single linked list of &struct drbd_tl_epoch objects. + * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail + * of the list. There is always at least one &struct drbd_tl_epoch object. + * + * Each &struct drbd_tl_epoch has a circular double linked list of requests + * attached. + */ +static int tl_init(struct drbd_conf *mdev) +{ + struct drbd_tl_epoch *b; + + /* during device minor initialization, we may well use GFP_KERNEL */ + b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL); + if (!b) + return 0; + INIT_LIST_HEAD(&b->requests); + INIT_LIST_HEAD(&b->w.list); + b->next = NULL; + b->br_number = 4711; + b->n_writes = 0; + b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */ + + mdev->oldest_tle = b; + mdev->newest_tle = b; + INIT_LIST_HEAD(&mdev->out_of_sequence_requests); + INIT_LIST_HEAD(&mdev->barrier_acked_requests); + + mdev->tl_hash = NULL; + mdev->tl_hash_s = 0; + + return 1; +} + +static void tl_cleanup(struct drbd_conf *mdev) +{ + D_ASSERT(mdev->oldest_tle == mdev->newest_tle); + D_ASSERT(list_empty(&mdev->out_of_sequence_requests)); + kfree(mdev->oldest_tle); + mdev->oldest_tle = NULL; + kfree(mdev->unused_spare_tle); + mdev->unused_spare_tle = NULL; + kfree(mdev->tl_hash); + mdev->tl_hash = NULL; + mdev->tl_hash_s = 0; +} + +/** + * _tl_add_barrier() - Adds a barrier to the transfer log + * @mdev: DRBD device. + * @new: Barrier to be added before the current head of the TL. + * + * The caller must hold the req_lock. + */ +void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new) +{ + struct drbd_tl_epoch *newest_before; + + INIT_LIST_HEAD(&new->requests); + INIT_LIST_HEAD(&new->w.list); + new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */ + new->next = NULL; + new->n_writes = 0; + + newest_before = mdev->newest_tle; + new->br_number = newest_before->br_number+1; + if (mdev->newest_tle != new) { + mdev->newest_tle->next = new; + mdev->newest_tle = new; + } +} + +/** + * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL + * @mdev: DRBD device. + * @barrier_nr: Expected identifier of the DRBD write barrier packet. + * @set_size: Expected number of requests before that barrier. + * + * In case the passed barrier_nr or set_size does not match the oldest + * &struct drbd_tl_epoch objects this function will cause a termination + * of the connection. 
+ */ +void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr, + unsigned int set_size) +{ + struct drbd_tl_epoch *b, *nob; /* next old barrier */ + struct list_head *le, *tle; + struct drbd_request *r; + + spin_lock_irq(&mdev->req_lock); + + b = mdev->oldest_tle; + + /* first some paranoia code */ + if (b == NULL) { + dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n", + barrier_nr); + goto bail; + } + if (b->br_number != barrier_nr) { + dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n", + barrier_nr, b->br_number); + goto bail; + } + if (b->n_writes != set_size) { + dev_err(DEV, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n", + barrier_nr, set_size, b->n_writes); + goto bail; + } + + /* Clean up list of requests processed during current epoch */ + list_for_each_safe(le, tle, &b->requests) { + r = list_entry(le, struct drbd_request, tl_requests); + _req_mod(r, barrier_acked); + } + /* There could be requests on the list waiting for completion + of the write to the local disk. To avoid corruptions of + slab's data structures we have to remove the lists head. + + Also there could have been a barrier ack out of sequence, overtaking + the write acks - which would be a bug and violating write ordering. + To not deadlock in case we lose connection while such requests are + still pending, we need some way to find them for the + _req_mode(connection_lost_while_pending). + + These have been list_move'd to the out_of_sequence_requests list in + _req_mod(, barrier_acked) above. + */ + list_splice_init(&b->requests, &mdev->barrier_acked_requests); + + nob = b->next; + if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) { + _tl_add_barrier(mdev, b); + if (nob) + mdev->oldest_tle = nob; + /* if nob == NULL b was the only barrier, and becomes the new + barrier. Therefore mdev->oldest_tle points already to b */ + } else { + D_ASSERT(nob != NULL); + mdev->oldest_tle = nob; + kfree(b); + } + + spin_unlock_irq(&mdev->req_lock); + dec_ap_pending(mdev); + + return; + +bail: + spin_unlock_irq(&mdev->req_lock); + drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); +} + + +/** + * _tl_restart() - Walks the transfer log, and applies an action to all requests + * @mdev: DRBD device. + * @what: The action/event to perform with all request objects + * + * @what might be one of connection_lost_while_pending, resend, fail_frozen_disk_io, + * restart_frozen_disk_io. + */ +static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what) +{ + struct drbd_tl_epoch *b, *tmp, **pn; + struct list_head *le, *tle, carry_reads; + struct drbd_request *req; + int rv, n_writes, n_reads; + + b = mdev->oldest_tle; + pn = &mdev->oldest_tle; + while (b) { + n_writes = 0; + n_reads = 0; + INIT_LIST_HEAD(&carry_reads); + list_for_each_safe(le, tle, &b->requests) { + req = list_entry(le, struct drbd_request, tl_requests); + rv = _req_mod(req, what); + + n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT; + n_reads += (rv & MR_READ) >> MR_READ_SHIFT; + } + tmp = b->next; + + if (n_writes) { + if (what == resend) { + b->n_writes = n_writes; + if (b->w.cb == NULL) { + b->w.cb = w_send_barrier; + inc_ap_pending(mdev); + set_bit(CREATE_BARRIER, &mdev->flags); + } + + drbd_queue_work(&mdev->data.work, &b->w); + } + pn = &b->next; + } else { + if (n_reads) + list_add(&carry_reads, &b->requests); + /* there could still be requests on that ring list, + * in case local io is still pending */ + list_del(&b->requests); + + /* dec_ap_pending corresponding to queue_barrier. 
+ * the newest barrier may not have been queued yet, + * in which case w.cb is still NULL. */ + if (b->w.cb != NULL) + dec_ap_pending(mdev); + + if (b == mdev->newest_tle) { + /* recycle, but reinit! */ + D_ASSERT(tmp == NULL); + INIT_LIST_HEAD(&b->requests); + list_splice(&carry_reads, &b->requests); + INIT_LIST_HEAD(&b->w.list); + b->w.cb = NULL; + b->br_number = net_random(); + b->n_writes = 0; + + *pn = b; + break; + } + *pn = tmp; + kfree(b); + } + b = tmp; + list_splice(&carry_reads, &b->requests); + } + + /* Actions operating on the disk state, also want to work on + requests that got barrier acked. */ + + list_for_each_safe(le, tle, &mdev->barrier_acked_requests) { + req = list_entry(le, struct drbd_request, tl_requests); + _req_mod(req, what); + } +} + + +/** + * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL + * @mdev: DRBD device. + * + * This is called after the connection to the peer was lost. The storage covered + * by the requests on the transfer gets marked as our of sync. Called from the + * receiver thread and the worker thread. + */ +void tl_clear(struct drbd_conf *mdev) +{ + spin_lock_irq(&mdev->req_lock); + _tl_clear(mdev); + spin_unlock_irq(&mdev->req_lock); +} + +static void _tl_clear(struct drbd_conf *mdev) +{ + struct list_head *le, *tle; + struct drbd_request *r; + + _tl_restart(mdev, connection_lost_while_pending); + + /* we expect this list to be empty. */ + D_ASSERT(list_empty(&mdev->out_of_sequence_requests)); + + /* but just in case, clean it up anyways! */ + list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) { + r = list_entry(le, struct drbd_request, tl_requests); + /* It would be nice to complete outside of spinlock. + * But this is easier for now. */ + _req_mod(r, connection_lost_while_pending); + } + + /* ensure bit indicating barrier is required is clear */ + clear_bit(CREATE_BARRIER, &mdev->flags); + + memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *)); + +} + +void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what) +{ + spin_lock_irq(&mdev->req_lock); + _tl_restart(mdev, what); + spin_unlock_irq(&mdev->req_lock); +} + +/** + * tl_abort_disk_io() - Abort disk I/O for all requests for a certain mdev in the TL + * @mdev: DRBD device. + */ +void tl_abort_disk_io(struct drbd_conf *mdev) +{ + struct drbd_tl_epoch *b; + struct list_head *le, *tle; + struct drbd_request *req; + + spin_lock_irq(&mdev->req_lock); + b = mdev->oldest_tle; + while (b) { + list_for_each_safe(le, tle, &b->requests) { + req = list_entry(le, struct drbd_request, tl_requests); + if (!(req->rq_state & RQ_LOCAL_PENDING)) + continue; + _req_mod(req, abort_disk_io); + } + b = b->next; + } + + list_for_each_safe(le, tle, &mdev->barrier_acked_requests) { + req = list_entry(le, struct drbd_request, tl_requests); + if (!(req->rq_state & RQ_LOCAL_PENDING)) + continue; + _req_mod(req, abort_disk_io); + } + + spin_unlock_irq(&mdev->req_lock); +} + +/** + * cl_wide_st_chg() - true if the state change is a cluster wide one + * @mdev: DRBD device. + * @os: old (current) state. + * @ns: new (wanted) state. 
+ */ +static int cl_wide_st_chg(struct drbd_conf *mdev, + union drbd_state os, union drbd_state ns) +{ + return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED && + ((os.role != R_PRIMARY && ns.role == R_PRIMARY) || + (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) || + (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) || + (os.disk != D_FAILED && ns.disk == D_FAILED))) || + (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) || + (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S); +} + +enum drbd_state_rv +drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f, + union drbd_state mask, union drbd_state val) +{ + unsigned long flags; + union drbd_state os, ns; + enum drbd_state_rv rv; + + spin_lock_irqsave(&mdev->req_lock, flags); + os = mdev->state; + ns.i = (os.i & ~mask.i) | val.i; + rv = _drbd_set_state(mdev, ns, f, NULL); + ns = mdev->state; + spin_unlock_irqrestore(&mdev->req_lock, flags); + + return rv; +} + +/** + * drbd_force_state() - Impose a change which happens outside our control on our state + * @mdev: DRBD device. + * @mask: mask of state bits to change. + * @val: value of new state bits. + */ +void drbd_force_state(struct drbd_conf *mdev, + union drbd_state mask, union drbd_state val) +{ + drbd_change_state(mdev, CS_HARD, mask, val); +} + +static enum drbd_state_rv is_valid_state(struct drbd_conf *, union drbd_state); +static enum drbd_state_rv is_valid_state_transition(struct drbd_conf *, + union drbd_state, + union drbd_state); +enum sanitize_state_warnings { + NO_WARNING, + ABORTED_ONLINE_VERIFY, + ABORTED_RESYNC, + CONNECTION_LOST_NEGOTIATING, + IMPLICITLY_UPGRADED_DISK, + IMPLICITLY_UPGRADED_PDSK, +}; +static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os, + union drbd_state ns, enum sanitize_state_warnings *warn); +int drbd_send_state_req(struct drbd_conf *, + union drbd_state, union drbd_state); + +static enum drbd_state_rv +_req_st_cond(struct drbd_conf *mdev, union drbd_state mask, + union drbd_state val) +{ + union drbd_state os, ns; + unsigned long flags; + enum drbd_state_rv rv; + + if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags)) + return SS_CW_SUCCESS; + + if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags)) + return SS_CW_FAILED_BY_PEER; + + rv = 0; + spin_lock_irqsave(&mdev->req_lock, flags); + os = mdev->state; + ns.i = (os.i & ~mask.i) | val.i; + ns = sanitize_state(mdev, os, ns, NULL); + + if (!cl_wide_st_chg(mdev, os, ns)) + rv = SS_CW_NO_NEED; + if (!rv) { + rv = is_valid_state(mdev, ns); + if (rv == SS_SUCCESS) { + rv = is_valid_state_transition(mdev, ns, os); + if (rv == SS_SUCCESS) + rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */ + } + } + spin_unlock_irqrestore(&mdev->req_lock, flags); + + return rv; +} + +/** + * drbd_req_state() - Perform an eventually cluster wide state change + * @mdev: DRBD device. + * @mask: mask of state bits to change. + * @val: value of new state bits. + * @f: flags + * + * Should not be called directly, use drbd_request_state() or + * _drbd_request_state(). 
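The state helpers above all share one step: folding a (mask, val) pair into the current state word. A worked illustration of that expression; example_apply_mask_val() is a made-up name:

static union drbd_state example_apply_mask_val(union drbd_state os,
                                               union drbd_state mask,
                                               union drbd_state val)
{
        union drbd_state ns;

        /* Only the fields selected by mask change; with NS(conn, C_DISCONNECTING),
         * for instance, every field except .conn keeps its value from os. */
        ns.i = (os.i & ~mask.i) | val.i;
        return ns;
}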
+ */ +static enum drbd_state_rv +drbd_req_state(struct drbd_conf *mdev, union drbd_state mask, + union drbd_state val, enum chg_state_flags f) +{ + struct completion done; + unsigned long flags; + union drbd_state os, ns; + enum drbd_state_rv rv; + + init_completion(&done); + + if (f & CS_SERIALIZE) + mutex_lock(&mdev->state_mutex); + + spin_lock_irqsave(&mdev->req_lock, flags); + os = mdev->state; + ns.i = (os.i & ~mask.i) | val.i; + ns = sanitize_state(mdev, os, ns, NULL); + + if (cl_wide_st_chg(mdev, os, ns)) { + rv = is_valid_state(mdev, ns); + if (rv == SS_SUCCESS) + rv = is_valid_state_transition(mdev, ns, os); + spin_unlock_irqrestore(&mdev->req_lock, flags); + + if (rv < SS_SUCCESS) { + if (f & CS_VERBOSE) + print_st_err(mdev, os, ns, rv); + goto abort; + } + + drbd_state_lock(mdev); + if (!drbd_send_state_req(mdev, mask, val)) { + drbd_state_unlock(mdev); + rv = SS_CW_FAILED_BY_PEER; + if (f & CS_VERBOSE) + print_st_err(mdev, os, ns, rv); + goto abort; + } + + wait_event(mdev->state_wait, + (rv = _req_st_cond(mdev, mask, val))); + + if (rv < SS_SUCCESS) { + drbd_state_unlock(mdev); + if (f & CS_VERBOSE) + print_st_err(mdev, os, ns, rv); + goto abort; + } + spin_lock_irqsave(&mdev->req_lock, flags); + os = mdev->state; + ns.i = (os.i & ~mask.i) | val.i; + rv = _drbd_set_state(mdev, ns, f, &done); + drbd_state_unlock(mdev); + } else { + rv = _drbd_set_state(mdev, ns, f, &done); + } + + spin_unlock_irqrestore(&mdev->req_lock, flags); + + if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) { + D_ASSERT(current != mdev->worker.task); + wait_for_completion(&done); + } + +abort: + if (f & CS_SERIALIZE) + mutex_unlock(&mdev->state_mutex); + + return rv; +} + +/** + * _drbd_request_state() - Request a state change (with flags) + * @mdev: DRBD device. + * @mask: mask of state bits to change. + * @val: value of new state bits. + * @f: flags + * + * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE + * flag, or when logging of failed state change requests is not desired. + */ +enum drbd_state_rv +_drbd_request_state(struct drbd_conf *mdev, union drbd_state mask, + union drbd_state val, enum chg_state_flags f) +{ + enum drbd_state_rv rv; + + wait_event(mdev->state_wait, + (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE); + + return rv; +} + +static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns) +{ + dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n", + name, + drbd_conn_str(ns.conn), + drbd_role_str(ns.role), + drbd_role_str(ns.peer), + drbd_disk_str(ns.disk), + drbd_disk_str(ns.pdsk), + is_susp(ns) ? 's' : 'r', + ns.aftr_isp ? 'a' : '-', + ns.peer_isp ? 'p' : '-', + ns.user_isp ? 'u' : '-' + ); +} + +void print_st_err(struct drbd_conf *mdev, union drbd_state os, + union drbd_state ns, enum drbd_state_rv err) +{ + if (err == SS_IN_TRANSIENT_STATE) + return; + dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err)); + print_st(mdev, " state", os); + print_st(mdev, "wanted", ns); +} + + +/** + * is_valid_state() - Returns an SS_ error code if ns is not valid + * @mdev: DRBD device. + * @ns: State to consider. 
+ */ +static enum drbd_state_rv +is_valid_state(struct drbd_conf *mdev, union drbd_state ns) +{ + /* See drbd_state_sw_errors in drbd_strings.c */ + + enum drbd_fencing_p fp; + enum drbd_state_rv rv = SS_SUCCESS; + + fp = FP_DONT_CARE; + if (get_ldev(mdev)) { + fp = mdev->ldev->dc.fencing; + put_ldev(mdev); + } + + if (get_net_conf(mdev)) { + if (!mdev->net_conf->two_primaries && + ns.role == R_PRIMARY && ns.peer == R_PRIMARY) + rv = SS_TWO_PRIMARIES; + put_net_conf(mdev); + } + + if (rv <= 0) + /* already found a reason to abort */; + else if (ns.role == R_SECONDARY && mdev->open_cnt) + rv = SS_DEVICE_IN_USE; + + else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE) + rv = SS_NO_UP_TO_DATE_DISK; + + else if (fp >= FP_RESOURCE && + ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN) + rv = SS_PRIMARY_NOP; + + else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT) + rv = SS_NO_UP_TO_DATE_DISK; + + else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT) + rv = SS_NO_LOCAL_DISK; + + else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT) + rv = SS_NO_REMOTE_DISK; + + else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) + rv = SS_NO_UP_TO_DATE_DISK; + + else if ((ns.conn == C_CONNECTED || + ns.conn == C_WF_BITMAP_S || + ns.conn == C_SYNC_SOURCE || + ns.conn == C_PAUSED_SYNC_S) && + ns.disk == D_OUTDATED) + rv = SS_CONNECTED_OUTDATES; + + else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && + (mdev->sync_conf.verify_alg[0] == 0)) + rv = SS_NO_VERIFY_ALG; + + else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && + mdev->agreed_pro_version < 88) + rv = SS_NOT_SUPPORTED; + + else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN) + rv = SS_CONNECTED_OUTDATES; + + return rv; +} + +/** + * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible + * @mdev: DRBD device. + * @ns: new state. + * @os: old state. + */ +static enum drbd_state_rv +is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns, + union drbd_state os) +{ + enum drbd_state_rv rv = SS_SUCCESS; + + if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) && + os.conn > C_CONNECTED) + rv = SS_RESYNC_RUNNING; + + if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE) + rv = SS_ALREADY_STANDALONE; + + if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS) + rv = SS_IS_DISKLESS; + + if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED) + rv = SS_NO_NET_CONFIG; + + if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING) + rv = SS_LOWER_THAN_OUTDATED; + + if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED) + rv = SS_IN_TRANSIENT_STATE; + + if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS) + rv = SS_IN_TRANSIENT_STATE; + + /* While establishing a connection only allow cstate to change. + Delay/refuse role changes, detach attach etc... 
*/ + if (test_bit(STATE_SENT, &mdev->flags) && + !(os.conn == C_WF_REPORT_PARAMS || + (ns.conn == C_WF_REPORT_PARAMS && os.conn == C_WF_CONNECTION))) + rv = SS_IN_TRANSIENT_STATE; + + if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED) + rv = SS_NEED_CONNECTION; + + if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && + ns.conn != os.conn && os.conn > C_CONNECTED) + rv = SS_RESYNC_RUNNING; + + if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) && + os.conn < C_CONNECTED) + rv = SS_NEED_CONNECTION; + + if ((ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE) + && os.conn < C_WF_REPORT_PARAMS) + rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */ + + return rv; +} + +static void print_sanitize_warnings(struct drbd_conf *mdev, enum sanitize_state_warnings warn) +{ + static const char *msg_table[] = { + [NO_WARNING] = "", + [ABORTED_ONLINE_VERIFY] = "Online-verify aborted.", + [ABORTED_RESYNC] = "Resync aborted.", + [CONNECTION_LOST_NEGOTIATING] = "Connection lost while negotiating, no data!", + [IMPLICITLY_UPGRADED_DISK] = "Implicitly upgraded disk", + [IMPLICITLY_UPGRADED_PDSK] = "Implicitly upgraded pdsk", + }; + + if (warn != NO_WARNING) + dev_warn(DEV, "%s\n", msg_table[warn]); +} + +/** + * sanitize_state() - Resolves implicitly necessary additional changes to a state transition + * @mdev: DRBD device. + * @os: old state. + * @ns: new state. + * @warn_sync_abort: + * + * When we loose connection, we have to set the state of the peers disk (pdsk) + * to D_UNKNOWN. This rule and many more along those lines are in this function. + */ +static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os, + union drbd_state ns, enum sanitize_state_warnings *warn) +{ + enum drbd_fencing_p fp; + enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max; + + if (warn) + *warn = NO_WARNING; + + fp = FP_DONT_CARE; + if (get_ldev(mdev)) { + fp = mdev->ldev->dc.fencing; + put_ldev(mdev); + } + + /* Disallow Network errors to configure a device's network part */ + if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) && + os.conn <= C_DISCONNECTING) + ns.conn = os.conn; + + /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow. + * If you try to go into some Sync* state, that shall fail (elsewhere). */ + if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN && + ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_CONNECTED) + ns.conn = os.conn; + + /* we cannot fail (again) if we already detached */ + if (ns.disk == D_FAILED && os.disk == D_DISKLESS) + ns.disk = D_DISKLESS; + + /* After C_DISCONNECTING only C_STANDALONE may follow */ + if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE) + ns.conn = os.conn; + + if (ns.conn < C_CONNECTED) { + ns.peer_isp = 0; + ns.peer = R_UNKNOWN; + if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT) + ns.pdsk = D_UNKNOWN; + } + + /* Clear the aftr_isp when becoming unconfigured */ + if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY) + ns.aftr_isp = 0; + + /* Abort resync if a disk fails/detaches */ + if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED && + (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) { + if (warn) + *warn = os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ? 
+ ABORTED_ONLINE_VERIFY : ABORTED_RESYNC; + ns.conn = C_CONNECTED; + } + + /* Connection breaks down before we finished "Negotiating" */ + if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING && + get_ldev_if_state(mdev, D_NEGOTIATING)) { + if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) { + ns.disk = mdev->new_state_tmp.disk; + ns.pdsk = mdev->new_state_tmp.pdsk; + } else { + if (warn) + *warn = CONNECTION_LOST_NEGOTIATING; + ns.disk = D_DISKLESS; + ns.pdsk = D_UNKNOWN; + } + put_ldev(mdev); + } + + /* D_CONSISTENT and D_OUTDATED vanish when we get connected */ + if (ns.conn >= C_CONNECTED && ns.conn < C_AHEAD) { + if (ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED) + ns.disk = D_UP_TO_DATE; + if (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED) + ns.pdsk = D_UP_TO_DATE; + } + + /* Implications of the connection stat on the disk states */ + disk_min = D_DISKLESS; + disk_max = D_UP_TO_DATE; + pdsk_min = D_INCONSISTENT; + pdsk_max = D_UNKNOWN; + switch ((enum drbd_conns)ns.conn) { + case C_WF_BITMAP_T: + case C_PAUSED_SYNC_T: + case C_STARTING_SYNC_T: + case C_WF_SYNC_UUID: + case C_BEHIND: + disk_min = D_INCONSISTENT; + disk_max = D_OUTDATED; + pdsk_min = D_UP_TO_DATE; + pdsk_max = D_UP_TO_DATE; + break; + case C_VERIFY_S: + case C_VERIFY_T: + disk_min = D_UP_TO_DATE; + disk_max = D_UP_TO_DATE; + pdsk_min = D_UP_TO_DATE; + pdsk_max = D_UP_TO_DATE; + break; + case C_CONNECTED: + disk_min = D_DISKLESS; + disk_max = D_UP_TO_DATE; + pdsk_min = D_DISKLESS; + pdsk_max = D_UP_TO_DATE; + break; + case C_WF_BITMAP_S: + case C_PAUSED_SYNC_S: + case C_STARTING_SYNC_S: + case C_AHEAD: + disk_min = D_UP_TO_DATE; + disk_max = D_UP_TO_DATE; + pdsk_min = D_INCONSISTENT; + pdsk_max = D_CONSISTENT; /* D_OUTDATED would be nice. But explicit outdate necessary*/ + break; + case C_SYNC_TARGET: + disk_min = D_INCONSISTENT; + disk_max = D_INCONSISTENT; + pdsk_min = D_UP_TO_DATE; + pdsk_max = D_UP_TO_DATE; + break; + case C_SYNC_SOURCE: + disk_min = D_UP_TO_DATE; + disk_max = D_UP_TO_DATE; + pdsk_min = D_INCONSISTENT; + pdsk_max = D_INCONSISTENT; + break; + case C_STANDALONE: + case C_DISCONNECTING: + case C_UNCONNECTED: + case C_TIMEOUT: + case C_BROKEN_PIPE: + case C_NETWORK_FAILURE: + case C_PROTOCOL_ERROR: + case C_TEAR_DOWN: + case C_WF_CONNECTION: + case C_WF_REPORT_PARAMS: + case C_MASK: + break; + } + if (ns.disk > disk_max) + ns.disk = disk_max; + + if (ns.disk < disk_min) { + if (warn) + *warn = IMPLICITLY_UPGRADED_DISK; + ns.disk = disk_min; + } + if (ns.pdsk > pdsk_max) + ns.pdsk = pdsk_max; + + if (ns.pdsk < pdsk_min) { + if (warn) + *warn = IMPLICITLY_UPGRADED_PDSK; + ns.pdsk = pdsk_min; + } + + if (fp == FP_STONITH && + (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) && + !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED)) + ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */ + + if (mdev->sync_conf.on_no_data == OND_SUSPEND_IO && + (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) && + !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE)) + ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data available) */ + + if (ns.aftr_isp || ns.peer_isp || ns.user_isp) { + if (ns.conn == C_SYNC_SOURCE) + ns.conn = C_PAUSED_SYNC_S; + if (ns.conn == C_SYNC_TARGET) + ns.conn = C_PAUSED_SYNC_T; + } else { + if (ns.conn == C_PAUSED_SYNC_S) + ns.conn = C_SYNC_SOURCE; + if (ns.conn == C_PAUSED_SYNC_T) + ns.conn = C_SYNC_TARGET; + } + + return ns; +} + +/* helper for 
__drbd_set_state */ +static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs) +{ + if (mdev->agreed_pro_version < 90) + mdev->ov_start_sector = 0; + mdev->rs_total = drbd_bm_bits(mdev); + mdev->ov_position = 0; + if (cs == C_VERIFY_T) { + /* starting online verify from an arbitrary position + * does not fit well into the existing protocol. + * on C_VERIFY_T, we initialize ov_left and friends + * implicitly in receive_DataRequest once the + * first P_OV_REQUEST is received */ + mdev->ov_start_sector = ~(sector_t)0; + } else { + unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector); + if (bit >= mdev->rs_total) { + mdev->ov_start_sector = + BM_BIT_TO_SECT(mdev->rs_total - 1); + mdev->rs_total = 1; + } else + mdev->rs_total -= bit; + mdev->ov_position = mdev->ov_start_sector; + } + mdev->ov_left = mdev->rs_total; +} + +static void drbd_resume_al(struct drbd_conf *mdev) +{ + if (test_and_clear_bit(AL_SUSPENDED, &mdev->flags)) + dev_info(DEV, "Resumed AL updates\n"); +} + +/** + * __drbd_set_state() - Set a new DRBD state + * @mdev: DRBD device. + * @ns: new state. + * @flags: Flags + * @done: Optional completion, that will get completed after the after_state_ch() finished + * + * Caller needs to hold req_lock, and global_state_lock. Do not call directly. + */ +enum drbd_state_rv +__drbd_set_state(struct drbd_conf *mdev, union drbd_state ns, + enum chg_state_flags flags, struct completion *done) +{ + union drbd_state os; + enum drbd_state_rv rv = SS_SUCCESS; + enum sanitize_state_warnings ssw; + struct after_state_chg_work *ascw; + + os = mdev->state; + + ns = sanitize_state(mdev, os, ns, &ssw); + + if (ns.i == os.i) + return SS_NOTHING_TO_DO; + + if (!(flags & CS_HARD)) { + /* pre-state-change checks ; only look at ns */ + /* See drbd_state_sw_errors in drbd_strings.c */ + + rv = is_valid_state(mdev, ns); + if (rv < SS_SUCCESS) { + /* If the old state was illegal as well, then let + this happen...*/ + + if (is_valid_state(mdev, os) == rv) + rv = is_valid_state_transition(mdev, ns, os); + } else + rv = is_valid_state_transition(mdev, ns, os); + } + + if (rv < SS_SUCCESS) { + if (flags & CS_VERBOSE) + print_st_err(mdev, os, ns, rv); + return rv; + } + + print_sanitize_warnings(mdev, ssw); + + { + char *pbp, pb[300]; + pbp = pb; + *pbp = 0; + if (ns.role != os.role) + pbp += sprintf(pbp, "role( %s -> %s ) ", + drbd_role_str(os.role), + drbd_role_str(ns.role)); + if (ns.peer != os.peer) + pbp += sprintf(pbp, "peer( %s -> %s ) ", + drbd_role_str(os.peer), + drbd_role_str(ns.peer)); + if (ns.conn != os.conn) + pbp += sprintf(pbp, "conn( %s -> %s ) ", + drbd_conn_str(os.conn), + drbd_conn_str(ns.conn)); + if (ns.disk != os.disk) + pbp += sprintf(pbp, "disk( %s -> %s ) ", + drbd_disk_str(os.disk), + drbd_disk_str(ns.disk)); + if (ns.pdsk != os.pdsk) + pbp += sprintf(pbp, "pdsk( %s -> %s ) ", + drbd_disk_str(os.pdsk), + drbd_disk_str(ns.pdsk)); + if (is_susp(ns) != is_susp(os)) + pbp += sprintf(pbp, "susp( %d -> %d ) ", + is_susp(os), + is_susp(ns)); + if (ns.aftr_isp != os.aftr_isp) + pbp += sprintf(pbp, "aftr_isp( %d -> %d ) ", + os.aftr_isp, + ns.aftr_isp); + if (ns.peer_isp != os.peer_isp) + pbp += sprintf(pbp, "peer_isp( %d -> %d ) ", + os.peer_isp, + ns.peer_isp); + if (ns.user_isp != os.user_isp) + pbp += sprintf(pbp, "user_isp( %d -> %d ) ", + os.user_isp, + ns.user_isp); + dev_info(DEV, "%s\n", pb); + } + + /* solve the race between becoming unconfigured, + * worker doing the cleanup, and + * admin reconfiguring us: + * on (re)configure, first set CONFIG_PENDING, + * then 
wait for a potentially exiting worker, + * start the worker, and schedule one no_op. + * then proceed with configuration. + */ + if (ns.disk == D_DISKLESS && + ns.conn == C_STANDALONE && + ns.role == R_SECONDARY && + !test_and_set_bit(CONFIG_PENDING, &mdev->flags)) + set_bit(DEVICE_DYING, &mdev->flags); + + /* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference + * on the ldev here, to be sure the transition -> D_DISKLESS resp. + * drbd_ldev_destroy() won't happen before our corresponding + * after_state_ch works run, where we put_ldev again. */ + if ((os.disk != D_FAILED && ns.disk == D_FAILED) || + (os.disk != D_DISKLESS && ns.disk == D_DISKLESS)) + atomic_inc(&mdev->local_cnt); + + mdev->state = ns; + + if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING) + drbd_print_uuids(mdev, "attached to UUIDs"); + + wake_up(&mdev->misc_wait); + wake_up(&mdev->state_wait); + + /* aborted verify run. log the last position */ + if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) && + ns.conn < C_CONNECTED) { + mdev->ov_start_sector = + BM_BIT_TO_SECT(drbd_bm_bits(mdev) - mdev->ov_left); + dev_info(DEV, "Online Verify reached sector %llu\n", + (unsigned long long)mdev->ov_start_sector); + } + + if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) && + (ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)) { + dev_info(DEV, "Syncer continues.\n"); + mdev->rs_paused += (long)jiffies + -(long)mdev->rs_mark_time[mdev->rs_last_mark]; + if (ns.conn == C_SYNC_TARGET) + mod_timer(&mdev->resync_timer, jiffies); + } + + if ((os.conn == C_SYNC_TARGET || os.conn == C_SYNC_SOURCE) && + (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) { + dev_info(DEV, "Resync suspended\n"); + mdev->rs_mark_time[mdev->rs_last_mark] = jiffies; + } + + if (os.conn == C_CONNECTED && + (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) { + unsigned long now = jiffies; + int i; + + set_ov_position(mdev, ns.conn); + mdev->rs_start = now; + mdev->rs_last_events = 0; + mdev->rs_last_sect_ev = 0; + mdev->ov_last_oos_size = 0; + mdev->ov_last_oos_start = 0; + + for (i = 0; i < DRBD_SYNC_MARKS; i++) { + mdev->rs_mark_left[i] = mdev->ov_left; + mdev->rs_mark_time[i] = now; + } + + drbd_rs_controller_reset(mdev); + + if (ns.conn == C_VERIFY_S) { + dev_info(DEV, "Starting Online Verify from sector %llu\n", + (unsigned long long)mdev->ov_position); + mod_timer(&mdev->resync_timer, jiffies); + } + } + + if (get_ldev(mdev)) { + u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND| + MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE| + MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY); + + if (test_bit(CRASHED_PRIMARY, &mdev->flags)) + mdf |= MDF_CRASHED_PRIMARY; + if (mdev->state.role == R_PRIMARY || + (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY)) + mdf |= MDF_PRIMARY_IND; + if (mdev->state.conn > C_WF_REPORT_PARAMS) + mdf |= MDF_CONNECTED_IND; + if (mdev->state.disk > D_INCONSISTENT) + mdf |= MDF_CONSISTENT; + if (mdev->state.disk > D_OUTDATED) + mdf |= MDF_WAS_UP_TO_DATE; + if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT) + mdf |= MDF_PEER_OUT_DATED; + if (mdf != mdev->ldev->md.flags) { + mdev->ldev->md.flags = mdf; + drbd_md_mark_dirty(mdev); + } + if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT) + drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]); + put_ldev(mdev); + } + + /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */ + if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT && + os.peer == R_SECONDARY && ns.peer == R_PRIMARY) + 
set_bit(CONSIDER_RESYNC, &mdev->flags); + + /* Receiver should clean up itself */ + if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING) + drbd_thread_stop_nowait(&mdev->receiver); + + /* Now the receiver finished cleaning up itself, it should die */ + if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE) + drbd_thread_stop_nowait(&mdev->receiver); + + /* Upon network failure, we need to restart the receiver. */ + if (os.conn > C_WF_CONNECTION && + ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT) + drbd_thread_restart_nowait(&mdev->receiver); + + /* Resume AL writing if we get a connection */ + if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) + drbd_resume_al(mdev); + + /* remember last connect and attach times so request_timer_fn() won't + * kill newly established sessions while we are still trying to thaw + * previously frozen IO */ + if (os.conn != C_WF_REPORT_PARAMS && ns.conn == C_WF_REPORT_PARAMS) + mdev->last_reconnect_jif = jiffies; + if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) && + ns.disk > D_NEGOTIATING) + mdev->last_reattach_jif = jiffies; + + ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC); + if (ascw) { + ascw->os = os; + ascw->ns = ns; + ascw->flags = flags; + ascw->w.cb = w_after_state_ch; + ascw->done = done; + drbd_queue_work(&mdev->data.work, &ascw->w); + } else { + dev_warn(DEV, "Could not kmalloc an ascw\n"); + } + + return rv; +} + +static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused) +{ + struct after_state_chg_work *ascw = + container_of(w, struct after_state_chg_work, w); + after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags); + if (ascw->flags & CS_WAIT_COMPLETE) { + D_ASSERT(ascw->done != NULL); + complete(ascw->done); + } + kfree(ascw); + + return 1; +} + +static void abw_start_sync(struct drbd_conf *mdev, int rv) +{ + if (rv) { + dev_err(DEV, "Writing the bitmap failed not starting resync.\n"); + _drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE); + return; + } + + switch (mdev->state.conn) { + case C_STARTING_SYNC_T: + _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE); + break; + case C_STARTING_SYNC_S: + drbd_start_resync(mdev, C_SYNC_SOURCE); + break; + } +} + +int drbd_bitmap_io_from_worker(struct drbd_conf *mdev, + int (*io_fn)(struct drbd_conf *), + char *why, enum bm_flag flags) +{ + int rv; + + D_ASSERT(current == mdev->worker.task); + + /* open coded non-blocking drbd_suspend_io(mdev); */ + set_bit(SUSPEND_IO, &mdev->flags); + + drbd_bm_lock(mdev, why, flags); + rv = io_fn(mdev); + drbd_bm_unlock(mdev); + + drbd_resume_io(mdev); + + return rv; +} /** - * tl_release() - mark as BARRIER_ACKED all requests in the corresponding transfer log epoch - * @tconn: DRBD connection. - * @barrier_nr: Expected identifier of the DRBD write barrier packet. - * @set_size: Expected number of requests before that barrier. - * - * In case the passed barrier_nr or set_size does not match the oldest - * epoch of not yet barrier-acked requests, this function will cause a - * termination of the connection. + * after_state_ch() - Perform after state change actions that may sleep + * @mdev: DRBD device. + * @os: old state. + * @ns: new state. 
+ * @flags: Flags */ -void tl_release(struct drbd_tconn *tconn, unsigned int barrier_nr, - unsigned int set_size) +static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, + union drbd_state ns, enum chg_state_flags flags) { - struct drbd_request *r; - struct drbd_request *req = NULL; - int expect_epoch = 0; - int expect_size = 0; - - spin_lock_irq(&tconn->req_lock); - - /* find oldest not yet barrier-acked write request, - * count writes in its epoch. */ - list_for_each_entry(r, &tconn->transfer_log, tl_requests) { - const unsigned s = r->rq_state; - if (!req) { - if (!(s & RQ_WRITE)) - continue; - if (!(s & RQ_NET_MASK)) - continue; - if (s & RQ_NET_DONE) - continue; - req = r; - expect_epoch = req->epoch; - expect_size ++; - } else { - if (r->epoch != expect_epoch) - break; - if (!(s & RQ_WRITE)) - continue; - /* if (s & RQ_DONE): not expected */ - /* if (!(s & RQ_NET_MASK)): not expected */ - expect_size++; + enum drbd_fencing_p fp; + enum drbd_req_event what = nothing; + union drbd_state nsm = (union drbd_state){ .i = -1 }; + + if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) { + clear_bit(CRASHED_PRIMARY, &mdev->flags); + if (mdev->p_uuid) + mdev->p_uuid[UI_FLAGS] &= ~((u64)2); + } + + fp = FP_DONT_CARE; + if (get_ldev(mdev)) { + fp = mdev->ldev->dc.fencing; + put_ldev(mdev); + } + + /* Inform userspace about the change... */ + drbd_bcast_state(mdev, ns); + + if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) && + (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)) + drbd_khelper(mdev, "pri-on-incon-degr"); + + /* Here we have the actions that are performed after a + state change. This function might sleep */ + + if (os.disk <= D_NEGOTIATING && ns.disk > D_NEGOTIATING) + mod_timer(&mdev->request_timer, jiffies + HZ); + + nsm.i = -1; + if (ns.susp_nod) { + if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) + what = resend; + + if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) && + ns.disk > D_NEGOTIATING) + what = restart_frozen_disk_io; + + if (what != nothing) + nsm.susp_nod = 0; + } + + if (ns.susp_fen) { + /* case1: The outdate peer handler is successful: */ + if (os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) { + if (test_bit(NEW_CUR_UUID, &mdev->flags)) { + drbd_uuid_new_current(mdev); + clear_bit(NEW_CUR_UUID, &mdev->flags); + } + spin_lock_irq(&mdev->req_lock); + _tl_clear(mdev); + _drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL); + spin_unlock_irq(&mdev->req_lock); + } + /* case2: The connection was established again: */ + if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) { + clear_bit(NEW_CUR_UUID, &mdev->flags); + what = resend; + nsm.susp_fen = 0; } } - /* first some paranoia code */ - if (req == NULL) { - conn_err(tconn, "BAD! BarrierAck #%u received, but no epoch in tl!?\n", - barrier_nr); - goto bail; + if (what != nothing) { + spin_lock_irq(&mdev->req_lock); + _tl_restart(mdev, what); + nsm.i &= mdev->state.i; + _drbd_set_state(mdev, nsm, CS_VERBOSE, NULL); + spin_unlock_irq(&mdev->req_lock); } - if (expect_epoch != barrier_nr) { - conn_err(tconn, "BAD! BarrierAck #%u received, expected #%u!\n", - barrier_nr, expect_epoch); - goto bail; + + /* Became sync source. With protocol >= 96, we still need to send out + * the sync uuid now. Need to do that before any drbd_send_state, or + * the other side may go "paused sync" before receiving the sync uuids, + * which is unexpected. 
*/ + if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) && + (ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) && + mdev->agreed_pro_version >= 96 && get_ldev(mdev)) { + drbd_gen_and_send_sync_uuid(mdev); + put_ldev(mdev); } - if (expect_size != set_size) { - conn_err(tconn, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n", - barrier_nr, set_size, expect_size); - goto bail; + /* Do not change the order of the if above and the two below... */ + if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */ + /* we probably will start a resync soon. + * make sure those things are properly reset. */ + mdev->rs_total = 0; + mdev->rs_failed = 0; + atomic_set(&mdev->rs_pending_cnt, 0); + drbd_rs_cancel_all(mdev); + + drbd_send_uuids(mdev); + drbd_send_state(mdev, ns); + } + /* No point in queuing send_bitmap if we don't have a connection + * anymore, so check also the _current_ state, not only the new state + * at the time this work was queued. */ + if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S && + mdev->state.conn == C_WF_BITMAP_S) + drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL, + "send_bitmap (WFBitMapS)", + BM_LOCKED_TEST_ALLOWED); + + /* Lost contact to peer's copy of the data */ + if ((os.pdsk >= D_INCONSISTENT && + os.pdsk != D_UNKNOWN && + os.pdsk != D_OUTDATED) + && (ns.pdsk < D_INCONSISTENT || + ns.pdsk == D_UNKNOWN || + ns.pdsk == D_OUTDATED)) { + if (get_ldev(mdev)) { + if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) && + mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { + if (is_susp(mdev->state)) { + set_bit(NEW_CUR_UUID, &mdev->flags); + } else { + drbd_uuid_new_current(mdev); + drbd_send_uuids(mdev); + } + } + put_ldev(mdev); + } } - /* Clean up list of requests processed during current epoch. */ - /* this extra list walk restart is paranoia, - * to catch requests being barrier-acked "unexpectedly". - * It usually should find the same req again, or some READ preceding it. */ - list_for_each_entry(req, &tconn->transfer_log, tl_requests) - if (req->epoch == expect_epoch) - break; - list_for_each_entry_safe_from(req, r, &tconn->transfer_log, tl_requests) { - if (req->epoch != expect_epoch) - break; - _req_mod(req, BARRIER_ACKED); + if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) { + if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY && + mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { + drbd_uuid_new_current(mdev); + drbd_send_uuids(mdev); + } + /* D_DISKLESS Peer becomes secondary */ + if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY) + /* We may still be Primary ourselves. + * No harm done if the bitmap still changes, + * redirtied pages will follow later. */ + drbd_bitmap_io_from_worker(mdev, &drbd_bm_write, + "demote diskless peer", BM_LOCKED_SET_ALLOWED); + put_ldev(mdev); } - spin_unlock_irq(&tconn->req_lock); - return; + /* Write out all changed bits on demote. + * Though, no need to da that just yet + * if there is a resync going on still */ + if (os.role == R_PRIMARY && ns.role == R_SECONDARY && + mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) { + /* No changes to the bitmap expected this time, so assert that, + * even though no harm was done if it did change. */ + drbd_bitmap_io_from_worker(mdev, &drbd_bm_write, + "demote", BM_LOCKED_TEST_ALLOWED); + put_ldev(mdev); + } -bail: - spin_unlock_irq(&tconn->req_lock); - conn_request_state(tconn, NS(conn, C_PROTOCOL_ERROR), CS_HARD); -} + /* Last part of the attaching process ... 
*/
+	if (ns.conn >= C_CONNECTED &&
+	    os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
+		drbd_send_sizes(mdev, 0, 0);  /* to start sync... */
+		drbd_send_uuids(mdev);
+		drbd_send_state(mdev, ns);
+	}
+	/* We want to pause/continue resync, tell peer. */
+	if (ns.conn >= C_CONNECTED &&
+	    ((os.aftr_isp != ns.aftr_isp) ||
+	     (os.user_isp != ns.user_isp)))
+		drbd_send_state(mdev, ns);
+
+	/* In case one of the isp bits got set, suspend other devices. */
+	if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
+	    (ns.aftr_isp || ns.peer_isp || ns.user_isp))
+		suspend_other_sg(mdev);
+
+	/* Make sure the peer gets informed about eventual state
+	   changes (ISP bits) while we were in WFReportParams. */
+	if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
+		drbd_send_state(mdev, ns);
+
+	if (os.conn != C_AHEAD && ns.conn == C_AHEAD)
+		drbd_send_state(mdev, ns);
+
+	/* We are in the process of starting a full sync... */
+	if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
+	    (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
+		/* no other bitmap changes expected during this phase */
+		drbd_queue_bitmap_io(mdev,
+			&drbd_bmio_set_n_write, &abw_start_sync,
+			"set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED);
+
+	/* We are invalidating ourselves... */
+	if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
+	    os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
+		/* other bitmap operation expected during this phase */
+		drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL,
+			"set_n_write from invalidate", BM_LOCKED_MASK);
+
+	/* first half of local IO error, failure to attach,
+	 * or administrative detach */
+	if (os.disk != D_FAILED && ns.disk == D_FAILED) {
+		enum drbd_io_error_p eh = EP_PASS_ON;
+		int was_io_error = 0;
+		/* corresponding get_ldev was in __drbd_set_state, to serialize
+		 * our cleanup here with the transition to D_DISKLESS.
+		 * But it is still not safe to dereference ldev here, since
+		 * we might come from a failed Attach before ldev was set. */
+		if (mdev->ldev) {
+			eh = mdev->ldev->dc.on_io_error;
+			was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags);
+
+			if (was_io_error && eh == EP_CALL_HELPER)
+				drbd_khelper(mdev, "local-io-error");
+
+			/* Immediately allow completion of all application IO,
+			 * that waits for completion from the local disk,
+			 * if this was a force-detach due to disk_timeout
+			 * or administrator request (drbdsetup detach --force).
+			 * Do NOT abort otherwise.
+			 * Aborting local requests may cause serious problems,
+			 * if requests are completed to upper layers already,
+			 * and then later the already submitted local bio completes.
+			 * This can cause DMA into former bio pages that meanwhile
+			 * have been re-used for other things.
+			 * So aborting local requests may cause crashes,
+			 * or even worse, silent data corruption.
+			 */
+			if (test_and_clear_bit(FORCE_DETACH, &mdev->flags))
+				tl_abort_disk_io(mdev);
+
+			/* current state still has to be D_FAILED,
+			 * there is only one way out: to D_DISKLESS,
+			 * and that may only happen after our put_ldev below. */
+			if (mdev->state.disk != D_FAILED)
+				dev_err(DEV,
+					"ASSERT FAILED: disk is %s during detach\n",
+					drbd_disk_str(mdev->state.disk));
+
+			if (ns.conn >= C_CONNECTED)
+				drbd_send_state(mdev, ns);
+
+			drbd_rs_cancel_all(mdev);
+
+			/* In case we want to get something to stable storage still,
+			 * this may be the last chance.
+			 * Following put_ldev may transition to D_DISKLESS.
*/ + drbd_md_sync(mdev); + } + put_ldev(mdev); + } -/** - * _tl_restart() - Walks the transfer log, and applies an action to all requests - * @mdev: DRBD device. - * @what: The action/event to perform with all request objects - * - * @what might be one of CONNECTION_LOST_WHILE_PENDING, RESEND, FAIL_FROZEN_DISK_IO, - * RESTART_FROZEN_DISK_IO. - */ -/* must hold resource->req_lock */ -void _tl_restart(struct drbd_tconn *tconn, enum drbd_req_event what) -{ - struct drbd_request *req, *r; + /* second half of local IO error, failure to attach, + * or administrative detach, + * after local_cnt references have reached zero again */ + if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) { + /* We must still be diskless, + * re-attach has to be serialized with this! */ + if (mdev->state.disk != D_DISKLESS) + dev_err(DEV, + "ASSERT FAILED: disk is %s while going diskless\n", + drbd_disk_str(mdev->state.disk)); + + if (ns.conn >= C_CONNECTED) + drbd_send_state(mdev, ns); + + /* corresponding get_ldev in __drbd_set_state + * this may finally trigger drbd_ldev_destroy. */ + put_ldev(mdev); + } - list_for_each_entry_safe(req, r, &tconn->transfer_log, tl_requests) - _req_mod(req, what); -} + /* Notify peer that I had a local IO error, and did not detached.. */ + if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT && ns.conn >= C_CONNECTED) + drbd_send_state(mdev, ns); -void tl_restart(struct drbd_tconn *tconn, enum drbd_req_event what) -{ - spin_lock_irq(&tconn->req_lock); - _tl_restart(tconn, what); - spin_unlock_irq(&tconn->req_lock); -} + /* Disks got bigger while they were detached */ + if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING && + test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) { + if (ns.conn == C_CONNECTED) + resync_after_online_grow(mdev); + } -/** - * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL - * @mdev: DRBD device. - * - * This is called after the connection to the peer was lost. The storage covered - * by the requests on the transfer gets marked as our of sync. Called from the - * receiver thread and the worker thread. - */ -void tl_clear(struct drbd_tconn *tconn) -{ - tl_restart(tconn, CONNECTION_LOST_WHILE_PENDING); -} + /* A resync finished or aborted, wake paused devices... */ + if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) || + (os.peer_isp && !ns.peer_isp) || + (os.user_isp && !ns.user_isp)) + resume_next_sg(mdev); + + /* sync target done with resync. Explicitly notify peer, even though + * it should (at least for non-empty resyncs) already know itself. */ + if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED) + drbd_send_state(mdev, ns); + + /* Wake up role changes, that were delayed because of connection establishing */ + if (os.conn == C_WF_REPORT_PARAMS && ns.conn != C_WF_REPORT_PARAMS) { + clear_bit(STATE_SENT, &mdev->flags); + wake_up(&mdev->state_wait); + } -/** - * tl_abort_disk_io() - Abort disk I/O for all requests for a certain mdev in the TL - * @mdev: DRBD device. - */ -void tl_abort_disk_io(struct drbd_conf *mdev) -{ - struct drbd_tconn *tconn = mdev->tconn; - struct drbd_request *req, *r; + /* This triggers bitmap writeout of potentially still unwritten pages + * if the resync finished cleanly, or aborted because of peer disk + * failure, or because of connection loss. + * For resync aborted because of local disk failure, we cannot do + * any bitmap writeout anymore. + * No harm done if some bits change during this phase. 
+ */ + if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(mdev)) { + drbd_queue_bitmap_io(mdev, &drbd_bm_write_copy_pages, NULL, + "write from resync_finished", BM_LOCKED_CHANGE_ALLOWED); + put_ldev(mdev); + } - spin_lock_irq(&tconn->req_lock); - list_for_each_entry_safe(req, r, &tconn->transfer_log, tl_requests) { - if (!(req->rq_state & RQ_LOCAL_PENDING)) - continue; - if (req->w.mdev != mdev) - continue; - _req_mod(req, ABORT_DISK_IO); + /* free tl_hash if we Got thawed and are C_STANDALONE */ + if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash) + drbd_free_tl_hash(mdev); + + /* Upon network connection, we need to start the receiver */ + if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED) + drbd_thread_start(&mdev->receiver); + + /* Terminate worker thread if we are unconfigured - it will be + restarted as needed... */ + if (ns.disk == D_DISKLESS && + ns.conn == C_STANDALONE && + ns.role == R_SECONDARY) { + if (os.aftr_isp != ns.aftr_isp) + resume_next_sg(mdev); + /* set in __drbd_set_state, unless CONFIG_PENDING was set */ + if (test_bit(DEVICE_DYING, &mdev->flags)) + drbd_thread_stop_nowait(&mdev->worker); } - spin_unlock_irq(&tconn->req_lock); + + drbd_md_sync(mdev); } + static int drbd_thread_setup(void *arg) { struct drbd_thread *thi = (struct drbd_thread *) arg; - struct drbd_tconn *tconn = thi->tconn; + struct drbd_conf *mdev = thi->mdev; unsigned long flags; int retval; - snprintf(current->comm, sizeof(current->comm), "drbd_%c_%s", - thi->name[0], thi->tconn->name); - restart: retval = thi->function(thi); spin_lock_irqsave(&thi->t_lock, flags); - /* if the receiver has been "EXITING", the last thing it did + /* if the receiver has been "Exiting", the last thing it did * was set the conn state to "StandAlone", * if now a re-connect request comes in, conn state goes C_UNCONNECTED, * and receiver thread will be "started". - * drbd_thread_start needs to set "RESTARTING" in that case. + * drbd_thread_start needs to set "Restarting" in that case. * t_state check and assignment needs to be within the same spinlock, - * so either thread_start sees EXITING, and can remap to RESTARTING, - * or thread_start see NONE, and can proceed as normal. + * so either thread_start sees Exiting, and can remap to Restarting, + * or thread_start see None, and can proceed as normal. 
*/ - if (thi->t_state == RESTARTING) { - conn_info(tconn, "Restarting %s thread\n", thi->name); - thi->t_state = RUNNING; + if (thi->t_state == Restarting) { + dev_info(DEV, "Restarting %s\n", current->comm); + thi->t_state = Running; spin_unlock_irqrestore(&thi->t_lock, flags); goto restart; } thi->task = NULL; - thi->t_state = NONE; + thi->t_state = None; smp_mb(); - complete_all(&thi->stop); + complete(&thi->stop); spin_unlock_irqrestore(&thi->t_lock, flags); - conn_info(tconn, "Terminating %s\n", current->comm); + dev_info(DEV, "Terminating %s\n", current->comm); /* Release mod reference taken when thread was started */ - - kref_put(&tconn->kref, &conn_destroy); module_put(THIS_MODULE); return retval; } -static void drbd_thread_init(struct drbd_tconn *tconn, struct drbd_thread *thi, - int (*func) (struct drbd_thread *), char *name) +static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi, + int (*func) (struct drbd_thread *)) { spin_lock_init(&thi->t_lock); thi->task = NULL; - thi->t_state = NONE; + thi->t_state = None; thi->function = func; - thi->tconn = tconn; - strncpy(thi->name, name, ARRAY_SIZE(thi->name)); + thi->mdev = mdev; } int drbd_thread_start(struct drbd_thread *thi) { - struct drbd_tconn *tconn = thi->tconn; + struct drbd_conf *mdev = thi->mdev; struct task_struct *nt; unsigned long flags; + const char *me = + thi == &mdev->receiver ? "receiver" : + thi == &mdev->asender ? "asender" : + thi == &mdev->worker ? "worker" : "NONSENSE"; + /* is used from state engine doing drbd_thread_stop_nowait, * while holding the req lock irqsave */ spin_lock_irqsave(&thi->t_lock, flags); switch (thi->t_state) { - case NONE: - conn_info(tconn, "Starting %s thread (from %s [%d])\n", - thi->name, current->comm, current->pid); + case None: + dev_info(DEV, "Starting %s thread (from %s [%d])\n", + me, current->comm, current->pid); /* Get ref on module for thread - this is released when thread exits */ if (!try_module_get(THIS_MODULE)) { - conn_err(tconn, "Failed to get module reference in drbd_thread_start\n"); + dev_err(DEV, "Failed to get module reference in drbd_thread_start\n"); spin_unlock_irqrestore(&thi->t_lock, flags); return false; } - kref_get(&thi->tconn->kref); - init_completion(&thi->stop); + D_ASSERT(thi->task == NULL); thi->reset_cpu_mask = 1; - thi->t_state = RUNNING; + thi->t_state = Running; spin_unlock_irqrestore(&thi->t_lock, flags); flush_signals(current); /* otherw. 
may get -ERESTARTNOINTR */ nt = kthread_create(drbd_thread_setup, (void *) thi, - "drbd_%c_%s", thi->name[0], thi->tconn->name); + "drbd%d_%s", mdev_to_minor(mdev), me); if (IS_ERR(nt)) { - conn_err(tconn, "Couldn't start thread\n"); + dev_err(DEV, "Couldn't start thread\n"); - kref_put(&tconn->kref, &conn_destroy); module_put(THIS_MODULE); return false; } spin_lock_irqsave(&thi->t_lock, flags); thi->task = nt; - thi->t_state = RUNNING; + thi->t_state = Running; spin_unlock_irqrestore(&thi->t_lock, flags); wake_up_process(nt); break; - case EXITING: - thi->t_state = RESTARTING; - conn_info(tconn, "Restarting %s thread (from %s [%d])\n", - thi->name, current->comm, current->pid); + case Exiting: + thi->t_state = Restarting; + dev_info(DEV, "Restarting %s thread (from %s [%d])\n", + me, current->comm, current->pid); /* fall through */ - case RUNNING: - case RESTARTING: + case Running: + case Restarting: default: spin_unlock_irqrestore(&thi->t_lock, flags); break; @@ -447,12 +1867,12 @@ void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait) { unsigned long flags; - enum drbd_thread_state ns = restart ? RESTARTING : EXITING; + enum drbd_thread_state ns = restart ? Restarting : Exiting; /* may be called from state engine, holding the req lock irqsave */ spin_lock_irqsave(&thi->t_lock, flags); - if (thi->t_state == NONE) { + if (thi->t_state == None) { spin_unlock_irqrestore(&thi->t_lock, flags); if (restart) drbd_thread_start(thi); @@ -470,6 +1890,7 @@ void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait) init_completion(&thi->stop); if (thi->task != current) force_sig(DRBD_SIGKILL, thi->task); + } spin_unlock_irqrestore(&thi->t_lock, flags); @@ -478,35 +1899,6 @@ void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait) wait_for_completion(&thi->stop); } -static struct drbd_thread *drbd_task_to_thread(struct drbd_tconn *tconn, struct task_struct *task) -{ - struct drbd_thread *thi = - task == tconn->receiver.task ? &tconn->receiver : - task == tconn->asender.task ? &tconn->asender : - task == tconn->worker.task ? &tconn->worker : NULL; - - return thi; -} - -char *drbd_task_to_thread_name(struct drbd_tconn *tconn, struct task_struct *task) -{ - struct drbd_thread *thi = drbd_task_to_thread(tconn, task); - return thi ? thi->name : task->comm; -} - -int conn_lowest_minor(struct drbd_tconn *tconn) -{ - struct drbd_conf *mdev; - int vnr = 0, m; - - rcu_read_lock(); - mdev = idr_get_next(&tconn->volumes, &vnr); - m = mdev ? mdev_to_minor(mdev) : -1; - rcu_read_unlock(); - - return m; -} - #ifdef CONFIG_SMP /** * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs @@ -515,345 +1907,238 @@ int conn_lowest_minor(struct drbd_tconn *tconn) * Forces all threads of a device onto the same CPU. This is beneficial for * DRBD's performance. May be overwritten by user's configuration. */ -void drbd_calc_cpu_mask(struct drbd_tconn *tconn) +void drbd_calc_cpu_mask(struct drbd_conf *mdev) { int ord, cpu; /* user override. 
*/ - if (cpumask_weight(tconn->cpu_mask)) + if (cpumask_weight(mdev->cpu_mask)) return; - ord = conn_lowest_minor(tconn) % cpumask_weight(cpu_online_mask); + ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask); for_each_online_cpu(cpu) { if (ord-- == 0) { - cpumask_set_cpu(cpu, tconn->cpu_mask); + cpumask_set_cpu(cpu, mdev->cpu_mask); return; } } /* should not be reached */ - cpumask_setall(tconn->cpu_mask); + cpumask_setall(mdev->cpu_mask); } /** * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread * @mdev: DRBD device. - * @thi: drbd_thread object * * call in the "main loop" of _all_ threads, no need for any mutex, current won't die * prematurely. */ -void drbd_thread_current_set_cpu(struct drbd_thread *thi) +void drbd_thread_current_set_cpu(struct drbd_conf *mdev) { struct task_struct *p = current; - + struct drbd_thread *thi = + p == mdev->asender.task ? &mdev->asender : + p == mdev->receiver.task ? &mdev->receiver : + p == mdev->worker.task ? &mdev->worker : + NULL; + ERR_IF(thi == NULL) + return; if (!thi->reset_cpu_mask) return; thi->reset_cpu_mask = 0; - set_cpus_allowed_ptr(p, thi->tconn->cpu_mask); + set_cpus_allowed_ptr(p, mdev->cpu_mask); } #endif -/** - * drbd_header_size - size of a packet header - * - * The header size is a multiple of 8, so any payload following the header is - * word aligned on 64-bit architectures. (The bitmap send and receive code - * relies on this.) - */ -unsigned int drbd_header_size(struct drbd_tconn *tconn) -{ - if (tconn->agreed_pro_version >= 100) { - BUILD_BUG_ON(!IS_ALIGNED(sizeof(struct p_header100), 8)); - return sizeof(struct p_header100); - } else { - BUILD_BUG_ON(sizeof(struct p_header80) != - sizeof(struct p_header95)); - BUILD_BUG_ON(!IS_ALIGNED(sizeof(struct p_header80), 8)); - return sizeof(struct p_header80); - } -} - -static unsigned int prepare_header80(struct p_header80 *h, enum drbd_packet cmd, int size) +/* the appropriate socket mutex must be held already */ +int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock, + enum drbd_packets cmd, struct p_header80 *h, + size_t size, unsigned msg_flags) { - h->magic = cpu_to_be32(DRBD_MAGIC); - h->command = cpu_to_be16(cmd); - h->length = cpu_to_be16(size); - return sizeof(struct p_header80); -} + int sent, ok; -static unsigned int prepare_header95(struct p_header95 *h, enum drbd_packet cmd, int size) -{ - h->magic = cpu_to_be16(DRBD_MAGIC_BIG); - h->command = cpu_to_be16(cmd); - h->length = cpu_to_be32(size); - return sizeof(struct p_header95); -} + ERR_IF(!h) return false; + ERR_IF(!size) return false; -static unsigned int prepare_header100(struct p_header100 *h, enum drbd_packet cmd, - int size, int vnr) -{ - h->magic = cpu_to_be32(DRBD_MAGIC_100); - h->volume = cpu_to_be16(vnr); + h->magic = BE_DRBD_MAGIC; h->command = cpu_to_be16(cmd); - h->length = cpu_to_be32(size); - h->pad = 0; - return sizeof(struct p_header100); -} - -static unsigned int prepare_header(struct drbd_tconn *tconn, int vnr, - void *buffer, enum drbd_packet cmd, int size) -{ - if (tconn->agreed_pro_version >= 100) - return prepare_header100(buffer, cmd, size, vnr); - else if (tconn->agreed_pro_version >= 95 && - size > DRBD_MAX_SIZE_H80_PACKET) - return prepare_header95(buffer, cmd, size); - else - return prepare_header80(buffer, cmd, size); -} - -static void *__conn_prepare_command(struct drbd_tconn *tconn, - struct drbd_socket *sock) -{ - if (!sock->socket) - return NULL; - return sock->sbuf + drbd_header_size(tconn); -} - -void *conn_prepare_command(struct drbd_tconn 
*tconn, struct drbd_socket *sock) -{ - void *p; - - mutex_lock(&sock->mutex); - p = __conn_prepare_command(tconn, sock); - if (!p) - mutex_unlock(&sock->mutex); + h->length = cpu_to_be16(size-sizeof(struct p_header80)); - return p; -} + sent = drbd_send(mdev, sock, h, size, msg_flags); -void *drbd_prepare_command(struct drbd_conf *mdev, struct drbd_socket *sock) -{ - return conn_prepare_command(mdev->tconn, sock); + ok = (sent == size); + if (!ok && !signal_pending(current)) + dev_warn(DEV, "short sent %s size=%d sent=%d\n", + cmdname(cmd), (int)size, sent); + return ok; } -static int __send_command(struct drbd_tconn *tconn, int vnr, - struct drbd_socket *sock, enum drbd_packet cmd, - unsigned int header_size, void *data, - unsigned int size) +/* don't pass the socket. we may only look at it + * when we hold the appropriate socket mutex. + */ +int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket, + enum drbd_packets cmd, struct p_header80 *h, size_t size) { - int msg_flags; - int err; - - /* - * Called with @data == NULL and the size of the data blocks in @size - * for commands that send data blocks. For those commands, omit the - * MSG_MORE flag: this will increase the likelihood that data blocks - * which are page aligned on the sender will end up page aligned on the - * receiver. - */ - msg_flags = data ? MSG_MORE : 0; - - header_size += prepare_header(tconn, vnr, sock->sbuf, cmd, - header_size + size); - err = drbd_send_all(tconn, sock->socket, sock->sbuf, header_size, - msg_flags); - if (data && !err) - err = drbd_send_all(tconn, sock->socket, data, size, 0); - return err; -} + int ok = 0; + struct socket *sock; -static int __conn_send_command(struct drbd_tconn *tconn, struct drbd_socket *sock, - enum drbd_packet cmd, unsigned int header_size, - void *data, unsigned int size) -{ - return __send_command(tconn, 0, sock, cmd, header_size, data, size); -} + if (use_data_socket) { + mutex_lock(&mdev->data.mutex); + sock = mdev->data.socket; + } else { + mutex_lock(&mdev->meta.mutex); + sock = mdev->meta.socket; + } -int conn_send_command(struct drbd_tconn *tconn, struct drbd_socket *sock, - enum drbd_packet cmd, unsigned int header_size, - void *data, unsigned int size) -{ - int err; + /* drbd_disconnect() could have called drbd_free_sock() + * while we were waiting in down()... 
*/ + if (likely(sock != NULL)) + ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0); - err = __conn_send_command(tconn, sock, cmd, header_size, data, size); - mutex_unlock(&sock->mutex); - return err; + if (use_data_socket) + mutex_unlock(&mdev->data.mutex); + else + mutex_unlock(&mdev->meta.mutex); + return ok; } -int drbd_send_command(struct drbd_conf *mdev, struct drbd_socket *sock, - enum drbd_packet cmd, unsigned int header_size, - void *data, unsigned int size) +int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data, + size_t size) { - int err; + struct p_header80 h; + int ok; - err = __send_command(mdev->tconn, mdev->vnr, sock, cmd, header_size, - data, size); - mutex_unlock(&sock->mutex); - return err; -} + h.magic = BE_DRBD_MAGIC; + h.command = cpu_to_be16(cmd); + h.length = cpu_to_be16(size); -int drbd_send_ping(struct drbd_tconn *tconn) -{ - struct drbd_socket *sock; + if (!drbd_get_data_sock(mdev)) + return 0; - sock = &tconn->meta; - if (!conn_prepare_command(tconn, sock)) - return -EIO; - return conn_send_command(tconn, sock, P_PING, 0, NULL, 0); -} + ok = (sizeof(h) == + drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0)); + ok = ok && (size == + drbd_send(mdev, mdev->data.socket, data, size, 0)); -int drbd_send_ping_ack(struct drbd_tconn *tconn) -{ - struct drbd_socket *sock; + drbd_put_data_sock(mdev); - sock = &tconn->meta; - if (!conn_prepare_command(tconn, sock)) - return -EIO; - return conn_send_command(tconn, sock, P_PING_ACK, 0, NULL, 0); + return ok; } -int drbd_send_sync_param(struct drbd_conf *mdev) +int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc) { - struct drbd_socket *sock; struct p_rs_param_95 *p; - int size; - const int apv = mdev->tconn->agreed_pro_version; - enum drbd_packet cmd; - struct net_conf *nc; - struct disk_conf *dc; - - sock = &mdev->tconn->data; - p = drbd_prepare_command(mdev, sock); - if (!p) - return -EIO; - - rcu_read_lock(); - nc = rcu_dereference(mdev->tconn->net_conf); + struct socket *sock; + int size, rv; + const int apv = mdev->agreed_pro_version; size = apv <= 87 ? sizeof(struct p_rs_param) : apv == 88 ? sizeof(struct p_rs_param) - + strlen(nc->verify_alg) + 1 + + strlen(mdev->sync_conf.verify_alg) + 1 : apv <= 94 ? sizeof(struct p_rs_param_89) : /* apv >= 95 */ sizeof(struct p_rs_param_95); - cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM; + /* used from admin command context and receiver/worker context. + * to avoid kmalloc, grab the socket right here, + * then use the pre-allocated sbuf there */ + mutex_lock(&mdev->data.mutex); + sock = mdev->data.socket; - /* initialize verify_alg and csums_alg */ - memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX); + if (likely(sock != NULL)) { + enum drbd_packets cmd = apv >= 89 ? 
P_SYNC_PARAM89 : P_SYNC_PARAM; - if (get_ldev(mdev)) { - dc = rcu_dereference(mdev->ldev->disk_conf); - p->resync_rate = cpu_to_be32(dc->resync_rate); - p->c_plan_ahead = cpu_to_be32(dc->c_plan_ahead); - p->c_delay_target = cpu_to_be32(dc->c_delay_target); - p->c_fill_target = cpu_to_be32(dc->c_fill_target); - p->c_max_rate = cpu_to_be32(dc->c_max_rate); - put_ldev(mdev); - } else { - p->resync_rate = cpu_to_be32(DRBD_RESYNC_RATE_DEF); - p->c_plan_ahead = cpu_to_be32(DRBD_C_PLAN_AHEAD_DEF); - p->c_delay_target = cpu_to_be32(DRBD_C_DELAY_TARGET_DEF); - p->c_fill_target = cpu_to_be32(DRBD_C_FILL_TARGET_DEF); - p->c_max_rate = cpu_to_be32(DRBD_C_MAX_RATE_DEF); - } + p = &mdev->data.sbuf.rs_param_95; + + /* initialize verify_alg and csums_alg */ + memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX); + + p->rate = cpu_to_be32(sc->rate); + p->c_plan_ahead = cpu_to_be32(sc->c_plan_ahead); + p->c_delay_target = cpu_to_be32(sc->c_delay_target); + p->c_fill_target = cpu_to_be32(sc->c_fill_target); + p->c_max_rate = cpu_to_be32(sc->c_max_rate); - if (apv >= 88) - strcpy(p->verify_alg, nc->verify_alg); - if (apv >= 89) - strcpy(p->csums_alg, nc->csums_alg); - rcu_read_unlock(); + if (apv >= 88) + strcpy(p->verify_alg, mdev->sync_conf.verify_alg); + if (apv >= 89) + strcpy(p->csums_alg, mdev->sync_conf.csums_alg); - return drbd_send_command(mdev, sock, cmd, size, NULL, 0); + rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0); + } else + rv = 0; /* not ok */ + + mutex_unlock(&mdev->data.mutex); + + return rv; } -int __drbd_send_protocol(struct drbd_tconn *tconn, enum drbd_packet cmd) +int drbd_send_protocol(struct drbd_conf *mdev) { - struct drbd_socket *sock; struct p_protocol *p; - struct net_conf *nc; - int size, cf; + int size, cf, rv; - sock = &tconn->data; - p = __conn_prepare_command(tconn, sock); - if (!p) - return -EIO; + size = sizeof(struct p_protocol); - rcu_read_lock(); - nc = rcu_dereference(tconn->net_conf); + if (mdev->agreed_pro_version >= 87) + size += strlen(mdev->net_conf->integrity_alg) + 1; - if (nc->tentative && tconn->agreed_pro_version < 92) { - rcu_read_unlock(); - mutex_unlock(&sock->mutex); - conn_err(tconn, "--dry-run is not supported by peer"); - return -EOPNOTSUPP; - } + /* we must not recurse into our own queue, + * as that is blocked during handshake */ + p = kmalloc(size, GFP_NOIO); + if (p == NULL) + return 0; - size = sizeof(*p); - if (tconn->agreed_pro_version >= 87) - size += strlen(nc->integrity_alg) + 1; + p->protocol = cpu_to_be32(mdev->net_conf->wire_protocol); + p->after_sb_0p = cpu_to_be32(mdev->net_conf->after_sb_0p); + p->after_sb_1p = cpu_to_be32(mdev->net_conf->after_sb_1p); + p->after_sb_2p = cpu_to_be32(mdev->net_conf->after_sb_2p); + p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries); - p->protocol = cpu_to_be32(nc->wire_protocol); - p->after_sb_0p = cpu_to_be32(nc->after_sb_0p); - p->after_sb_1p = cpu_to_be32(nc->after_sb_1p); - p->after_sb_2p = cpu_to_be32(nc->after_sb_2p); - p->two_primaries = cpu_to_be32(nc->two_primaries); cf = 0; - if (nc->discard_my_data) - cf |= CF_DISCARD_MY_DATA; - if (nc->tentative) - cf |= CF_DRY_RUN; + if (mdev->net_conf->want_lose) + cf |= CF_WANT_LOSE; + if (mdev->net_conf->dry_run) { + if (mdev->agreed_pro_version >= 92) + cf |= CF_DRY_RUN; + else { + dev_err(DEV, "--dry-run is not supported by peer"); + kfree(p); + return -1; + } + } p->conn_flags = cpu_to_be32(cf); - if (tconn->agreed_pro_version >= 87) - strcpy(p->integrity_alg, nc->integrity_alg); - rcu_read_unlock(); - - return __conn_send_command(tconn, 
sock, cmd, size, NULL, 0); -} - -int drbd_send_protocol(struct drbd_tconn *tconn) -{ - int err; + if (mdev->agreed_pro_version >= 87) + strcpy(p->integrity_alg, mdev->net_conf->integrity_alg); - mutex_lock(&tconn->data.mutex); - err = __drbd_send_protocol(tconn, P_PROTOCOL); - mutex_unlock(&tconn->data.mutex); - - return err; + rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL, + (struct p_header80 *)p, size); + kfree(p); + return rv; } int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags) { - struct drbd_socket *sock; - struct p_uuids *p; + struct p_uuids p; int i; if (!get_ldev_if_state(mdev, D_NEGOTIATING)) - return 0; + return 1; - sock = &mdev->tconn->data; - p = drbd_prepare_command(mdev, sock); - if (!p) { - put_ldev(mdev); - return -EIO; - } - spin_lock_irq(&mdev->ldev->md.uuid_lock); for (i = UI_CURRENT; i < UI_SIZE; i++) - p->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]); - spin_unlock_irq(&mdev->ldev->md.uuid_lock); + p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0; mdev->comm_bm_set = drbd_bm_total_weight(mdev); - p->uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set); - rcu_read_lock(); - uuid_flags |= rcu_dereference(mdev->tconn->net_conf)->discard_my_data ? 1 : 0; - rcu_read_unlock(); + p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set); + uuid_flags |= mdev->net_conf->want_lose ? 1 : 0; uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0; uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0; - p->uuid[UI_FLAGS] = cpu_to_be64(uuid_flags); + p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags); put_ldev(mdev); - return drbd_send_command(mdev, sock, P_UUIDS, sizeof(*p), NULL, 0); + + return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS, + (struct p_header80 *)&p, sizeof(p)); } int drbd_send_uuids(struct drbd_conf *mdev) @@ -884,10 +2169,9 @@ void drbd_print_uuids(struct drbd_conf *mdev, const char *text) } } -void drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev) +int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev) { - struct drbd_socket *sock; - struct p_rs_uuid *p; + struct p_rs_uuid p; u64 uuid; D_ASSERT(mdev->state.disk == D_UP_TO_DATE); @@ -900,29 +2184,24 @@ void drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev) drbd_uuid_set(mdev, UI_BITMAP, uuid); drbd_print_uuids(mdev, "updated sync UUID"); drbd_md_sync(mdev); + p.uuid = cpu_to_be64(uuid); - sock = &mdev->tconn->data; - p = drbd_prepare_command(mdev, sock); - if (p) { - p->uuid = cpu_to_be64(uuid); - drbd_send_command(mdev, sock, P_SYNC_UUID, sizeof(*p), NULL, 0); - } + return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID, + (struct p_header80 *)&p, sizeof(p)); } int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags) { - struct drbd_socket *sock; - struct p_sizes *p; + struct p_sizes p; sector_t d_size, u_size; int q_order_type; unsigned int max_bio_size; + int ok; if (get_ldev_if_state(mdev, D_NEGOTIATING)) { D_ASSERT(mdev->ldev->backing_bdev); d_size = drbd_get_max_capacity(mdev->ldev); - rcu_read_lock(); - u_size = rcu_dereference(mdev->ldev->disk_conf)->disk_size; - rcu_read_unlock(); + u_size = mdev->ldev->dc.disk_size; q_order_type = drbd_queue_order_type(mdev); max_bio_size = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9; max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE); @@ -934,23 +2213,20 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl max_bio_size = DRBD_MAX_BIO_SIZE; /* ... 
multiple BIOs per peer_request */ } - sock = &mdev->tconn->data; - p = drbd_prepare_command(mdev, sock); - if (!p) - return -EIO; - - if (mdev->tconn->agreed_pro_version <= 94) + /* Never allow old drbd (up to 8.3.7) to see more than 32KiB */ + if (mdev->agreed_pro_version <= 94) max_bio_size = min(max_bio_size, DRBD_MAX_SIZE_H80_PACKET); - else if (mdev->tconn->agreed_pro_version < 100) - max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE_P95); - p->d_size = cpu_to_be64(d_size); - p->u_size = cpu_to_be64(u_size); - p->c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev)); - p->max_bio_size = cpu_to_be32(max_bio_size); - p->queue_order_type = cpu_to_be16(q_order_type); - p->dds_flags = cpu_to_be16(flags); - return drbd_send_command(mdev, sock, P_SIZES, sizeof(*p), NULL, 0); + p.d_size = cpu_to_be64(d_size); + p.u_size = cpu_to_be64(u_size); + p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev)); + p.max_bio_size = cpu_to_be32(max_bio_size); + p.queue_order_type = cpu_to_be16(q_order_type); + p.dds_flags = cpu_to_be16(flags); + + ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES, + (struct p_header80 *)&p, sizeof(p)); + return ok; } /** @@ -959,21 +2235,34 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl */ int drbd_send_current_state(struct drbd_conf *mdev) { - struct drbd_socket *sock; - struct p_state *p; + struct socket *sock; + struct p_state p; + int ok = 0; - sock = &mdev->tconn->data; - p = drbd_prepare_command(mdev, sock); - if (!p) - return -EIO; - p->state = cpu_to_be32(mdev->state.i); /* Within the send mutex */ - return drbd_send_command(mdev, sock, P_STATE, sizeof(*p), NULL, 0); + /* Grab state lock so we wont send state if we're in the middle + * of a cluster wide state change on another thread */ + drbd_state_lock(mdev); + + mutex_lock(&mdev->data.mutex); + + p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */ + sock = mdev->data.socket; + + if (likely(sock != NULL)) { + ok = _drbd_send_cmd(mdev, sock, P_STATE, + (struct p_header80 *)&p, sizeof(p), 0); + } + + mutex_unlock(&mdev->data.mutex); + + drbd_state_unlock(mdev); + return ok; } /** * drbd_send_state() - After a state change, sends the new state to the peer - * @mdev: DRBD device. - * @state: the state to send, not necessarily the current state. + * @mdev: DRBD device. + * @state: the state to send, not necessarily the current state. * * Each state change queues an "after_state_ch" work, which will eventually * send the resulting new state to the peer. 
If more state changes happen @@ -982,95 +2271,50 @@ int drbd_send_current_state(struct drbd_conf *mdev) */ int drbd_send_state(struct drbd_conf *mdev, union drbd_state state) { - struct drbd_socket *sock; - struct p_state *p; + struct socket *sock; + struct p_state p; + int ok = 0; - sock = &mdev->tconn->data; - p = drbd_prepare_command(mdev, sock); - if (!p) - return -EIO; - p->state = cpu_to_be32(state.i); /* Within the send mutex */ - return drbd_send_command(mdev, sock, P_STATE, sizeof(*p), NULL, 0); -} + mutex_lock(&mdev->data.mutex); -int drbd_send_state_req(struct drbd_conf *mdev, union drbd_state mask, union drbd_state val) -{ - struct drbd_socket *sock; - struct p_req_state *p; + p.state = cpu_to_be32(state.i); + sock = mdev->data.socket; - sock = &mdev->tconn->data; - p = drbd_prepare_command(mdev, sock); - if (!p) - return -EIO; - p->mask = cpu_to_be32(mask.i); - p->val = cpu_to_be32(val.i); - return drbd_send_command(mdev, sock, P_STATE_CHG_REQ, sizeof(*p), NULL, 0); -} + if (likely(sock != NULL)) { + ok = _drbd_send_cmd(mdev, sock, P_STATE, + (struct p_header80 *)&p, sizeof(p), 0); + } -int conn_send_state_req(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val) -{ - enum drbd_packet cmd; - struct drbd_socket *sock; - struct p_req_state *p; + mutex_unlock(&mdev->data.mutex); - cmd = tconn->agreed_pro_version < 100 ? P_STATE_CHG_REQ : P_CONN_ST_CHG_REQ; - sock = &tconn->data; - p = conn_prepare_command(tconn, sock); - if (!p) - return -EIO; - p->mask = cpu_to_be32(mask.i); - p->val = cpu_to_be32(val.i); - return conn_send_command(tconn, sock, cmd, sizeof(*p), NULL, 0); + return ok; } -void drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode) +int drbd_send_state_req(struct drbd_conf *mdev, + union drbd_state mask, union drbd_state val) { - struct drbd_socket *sock; - struct p_req_state_reply *p; - - sock = &mdev->tconn->meta; - p = drbd_prepare_command(mdev, sock); - if (p) { - p->retcode = cpu_to_be32(retcode); - drbd_send_command(mdev, sock, P_STATE_CHG_REPLY, sizeof(*p), NULL, 0); - } -} + struct p_req_state p; -void conn_send_sr_reply(struct drbd_tconn *tconn, enum drbd_state_rv retcode) -{ - struct drbd_socket *sock; - struct p_req_state_reply *p; - enum drbd_packet cmd = tconn->agreed_pro_version < 100 ? P_STATE_CHG_REPLY : P_CONN_ST_CHG_REPLY; + p.mask = cpu_to_be32(mask.i); + p.val = cpu_to_be32(val.i); - sock = &tconn->meta; - p = conn_prepare_command(tconn, sock); - if (p) { - p->retcode = cpu_to_be32(retcode); - conn_send_command(tconn, sock, cmd, sizeof(*p), NULL, 0); - } + return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ, + (struct p_header80 *)&p, sizeof(p)); } -static void dcbp_set_code(struct p_compressed_bm *p, enum drbd_bitmap_code code) +int drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode) { - BUG_ON(code & ~0xf); - p->encoding = (p->encoding & ~0xf) | code; -} + struct p_req_state_reply p; -static void dcbp_set_start(struct p_compressed_bm *p, int set) -{ - p->encoding = (p->encoding & ~0x80) | (set ? 
0x80 : 0); -} + p.retcode = cpu_to_be32(retcode); -static void dcbp_set_pad_bits(struct p_compressed_bm *p, int n) -{ - BUG_ON(n & ~0x7); - p->encoding = (p->encoding & (~0x7 << 4)) | (n << 4); + return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY, + (struct p_header80 *)&p, sizeof(p)); } int fill_bitmap_rle_bits(struct drbd_conf *mdev, - struct p_compressed_bm *p, - unsigned int size, - struct bm_xfer_ctx *c) + struct p_compressed_bm *p, + struct bm_xfer_ctx *c) { struct bitstream bs; unsigned long plain_bits; @@ -1078,21 +2322,19 @@ int fill_bitmap_rle_bits(struct drbd_conf *mdev, unsigned long rl; unsigned len; unsigned toggle; - int bits, use_rle; + int bits; /* may we use this feature? */ - rcu_read_lock(); - use_rle = rcu_dereference(mdev->tconn->net_conf)->use_rle; - rcu_read_unlock(); - if (!use_rle || mdev->tconn->agreed_pro_version < 90) - return 0; + if ((mdev->sync_conf.use_rle == 0) || + (mdev->agreed_pro_version < 90)) + return 0; if (c->bit_offset >= c->bm_bits) return 0; /* nothing to do. */ /* use at most thus many bytes */ - bitstream_init(&bs, p->code, size, 0); - memset(p->code, 0, size); + bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0); + memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX); /* plain bits covered in this code string */ plain_bits = 0; @@ -1114,12 +2356,12 @@ int fill_bitmap_rle_bits(struct drbd_conf *mdev, if (rl == 0) { /* the first checked bit was set, * store start value, */ - dcbp_set_start(p, 1); + DCBP_set_start(p, 1); /* but skip encoding of zero run length */ toggle = !toggle; continue; } - dcbp_set_start(p, 0); + DCBP_set_start(p, 0); } /* paranoia: catch zero runlength. @@ -1159,7 +2401,7 @@ int fill_bitmap_rle_bits(struct drbd_conf *mdev, bm_xfer_ctx_bit_to_word_offset(c); /* store pad_bits */ - dcbp_set_pad_bits(p, (8 - bs.cur.bit) & 0x7); + DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7); return len; } @@ -1171,52 +2413,48 @@ int fill_bitmap_rle_bits(struct drbd_conf *mdev, * code upon failure. */ static int -send_bitmap_rle_or_plain(struct drbd_conf *mdev, struct bm_xfer_ctx *c) +send_bitmap_rle_or_plain(struct drbd_conf *mdev, + struct p_header80 *h, struct bm_xfer_ctx *c) { - struct drbd_socket *sock = &mdev->tconn->data; - unsigned int header_size = drbd_header_size(mdev->tconn); - struct p_compressed_bm *p = sock->sbuf + header_size; - int len, err; + struct p_compressed_bm *p = (void*)h; + unsigned long num_words; + int len; + int ok; + + len = fill_bitmap_rle_bits(mdev, p, c); - len = fill_bitmap_rle_bits(mdev, p, - DRBD_SOCKET_BUFFER_SIZE - header_size - sizeof(*p), c); if (len < 0) return -EIO; if (len) { - dcbp_set_code(p, RLE_VLI_Bits); - err = __send_command(mdev->tconn, mdev->vnr, sock, - P_COMPRESSED_BITMAP, sizeof(*p) + len, - NULL, 0); + DCBP_set_code(p, RLE_VLI_Bits); + ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h, + sizeof(*p) + len, 0); + c->packets[0]++; - c->bytes[0] += header_size + sizeof(*p) + len; + c->bytes[0] += sizeof(*p) + len; if (c->bit_offset >= c->bm_bits) len = 0; /* DONE */ } else { /* was not compressible. * send a buffer full of plain text bits instead. 
*/ - unsigned int data_size; - unsigned long num_words; - unsigned long *p = sock->sbuf + header_size; - - data_size = DRBD_SOCKET_BUFFER_SIZE - header_size; - num_words = min_t(size_t, data_size / sizeof(*p), - c->bm_words - c->word_offset); - len = num_words * sizeof(*p); + num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset); + len = num_words * sizeof(long); if (len) - drbd_bm_get_lel(mdev, c->word_offset, num_words, p); - err = __send_command(mdev->tconn, mdev->vnr, sock, P_BITMAP, len, NULL, 0); + drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload); + ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP, + h, sizeof(struct p_header80) + len, 0); c->word_offset += num_words; c->bit_offset = c->word_offset * BITS_PER_LONG; c->packets[1]++; - c->bytes[1] += header_size + len; + c->bytes[1] += sizeof(struct p_header80) + len; if (c->bit_offset > c->bm_bits) c->bit_offset = c->bm_bits; } - if (!err) { + if (ok) { if (len == 0) { INFO_bm_xfer_stats(mdev, "send", c); return 0; @@ -1227,13 +2465,21 @@ send_bitmap_rle_or_plain(struct drbd_conf *mdev, struct bm_xfer_ctx *c) } /* See the comment at receive_bitmap() */ -static int _drbd_send_bitmap(struct drbd_conf *mdev) +int _drbd_send_bitmap(struct drbd_conf *mdev) { struct bm_xfer_ctx c; + struct p_header80 *p; int err; - if (!expect(mdev->bitmap)) + ERR_IF(!mdev->bitmap) return false; + + /* maybe we should use some per thread scratch page, + * and allocate that during initial device creation? */ + p = (struct p_header80 *) __get_free_page(GFP_NOIO); + if (!p) { + dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__); return false; + } if (get_ldev(mdev)) { if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) { @@ -1258,39 +2504,37 @@ static int _drbd_send_bitmap(struct drbd_conf *mdev) }; do { - err = send_bitmap_rle_or_plain(mdev, &c); + err = send_bitmap_rle_or_plain(mdev, p, &c); } while (err > 0); + free_page((unsigned long) p); return err == 0; } int drbd_send_bitmap(struct drbd_conf *mdev) { - struct drbd_socket *sock = &mdev->tconn->data; - int err = -1; + int err; - mutex_lock(&sock->mutex); - if (sock->socket) - err = !_drbd_send_bitmap(mdev); - mutex_unlock(&sock->mutex); + if (!drbd_get_data_sock(mdev)) + return -1; + err = !_drbd_send_bitmap(mdev); + drbd_put_data_sock(mdev); return err; } -void drbd_send_b_ack(struct drbd_tconn *tconn, u32 barrier_nr, u32 set_size) +int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size) { - struct drbd_socket *sock; - struct p_barrier_ack *p; + int ok; + struct p_barrier_ack p; - if (tconn->cstate < C_WF_REPORT_PARAMS) - return; + p.barrier = barrier_nr; + p.set_size = cpu_to_be32(set_size); - sock = &tconn->meta; - p = conn_prepare_command(tconn, sock); - if (!p) - return; - p->barrier = barrier_nr; - p->set_size = cpu_to_be32(set_size); - conn_send_command(tconn, sock, P_BARRIER_ACK, sizeof(*p), NULL, 0); + if (mdev->state.conn < C_CONNECTED) + return false; + ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK, + (struct p_header80 *)&p, sizeof(p)); + return ok; } /** @@ -1301,62 +2545,62 @@ void drbd_send_b_ack(struct drbd_tconn *tconn, u32 barrier_nr, u32 set_size) * @blksize: size in byte, needs to be in big endian byte order * @block_id: Id, big endian byte order */ -static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packet cmd, - u64 sector, u32 blksize, u64 block_id) +static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd, + u64 sector, + u32 blksize, + u64 block_id) { - struct 
drbd_socket *sock; - struct p_block_ack *p; + int ok; + struct p_block_ack p; - if (mdev->state.conn < C_CONNECTED) - return -EIO; + p.sector = sector; + p.block_id = block_id; + p.blksize = blksize; + p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq)); - sock = &mdev->tconn->meta; - p = drbd_prepare_command(mdev, sock); - if (!p) - return -EIO; - p->sector = sector; - p->block_id = block_id; - p->blksize = blksize; - p->seq_num = cpu_to_be32(atomic_inc_return(&mdev->packet_seq)); - return drbd_send_command(mdev, sock, cmd, sizeof(*p), NULL, 0); + if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED) + return false; + ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd, + (struct p_header80 *)&p, sizeof(p)); + return ok; } /* dp->sector and dp->block_id already/still in network byte order, * data_size is payload size according to dp->head, * and may need to be corrected for digest size. */ -void drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packet cmd, - struct p_data *dp, int data_size) +int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd, + struct p_data *dp, int data_size) { - if (mdev->tconn->peer_integrity_tfm) - data_size -= crypto_hash_digestsize(mdev->tconn->peer_integrity_tfm); - _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size), - dp->block_id); + data_size -= (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ? + crypto_hash_digestsize(mdev->integrity_r_tfm) : 0; + return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size), + dp->block_id); } -void drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packet cmd, - struct p_block_req *rp) +int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd, + struct p_block_req *rp) { - _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id); + return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id); } /** * drbd_send_ack() - Sends an ack packet - * @mdev: DRBD device - * @cmd: packet command code - * @peer_req: peer request + * @mdev: DRBD device. + * @cmd: Packet command code. + * @e: Epoch entry. */ -int drbd_send_ack(struct drbd_conf *mdev, enum drbd_packet cmd, - struct drbd_peer_request *peer_req) +int drbd_send_ack(struct drbd_conf *mdev, + enum drbd_packets cmd, struct drbd_epoch_entry *e) { return _drbd_send_ack(mdev, cmd, - cpu_to_be64(peer_req->i.sector), - cpu_to_be32(peer_req->i.size), - peer_req->block_id); + cpu_to_be64(e->sector), + cpu_to_be32(e->size), + e->block_id); } /* This function misuses the block_id field to signal if the blocks * are is sync or not. 
*/ -int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packet cmd, +int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd, sector_t sector, int blksize, u64 block_id) { return _drbd_send_ack(mdev, cmd, @@ -1368,87 +2612,85 @@ int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packet cmd, int drbd_send_drequest(struct drbd_conf *mdev, int cmd, sector_t sector, int size, u64 block_id) { - struct drbd_socket *sock; - struct p_block_req *p; + int ok; + struct p_block_req p; - sock = &mdev->tconn->data; - p = drbd_prepare_command(mdev, sock); - if (!p) - return -EIO; - p->sector = cpu_to_be64(sector); - p->block_id = block_id; - p->blksize = cpu_to_be32(size); - return drbd_send_command(mdev, sock, cmd, sizeof(*p), NULL, 0); + p.sector = cpu_to_be64(sector); + p.block_id = block_id; + p.blksize = cpu_to_be32(size); + + ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd, + (struct p_header80 *)&p, sizeof(p)); + return ok; } -int drbd_send_drequest_csum(struct drbd_conf *mdev, sector_t sector, int size, - void *digest, int digest_size, enum drbd_packet cmd) -{ - struct drbd_socket *sock; - struct p_block_req *p; +int drbd_send_drequest_csum(struct drbd_conf *mdev, + sector_t sector, int size, + void *digest, int digest_size, + enum drbd_packets cmd) +{ + int ok; + struct p_block_req p; + + p.sector = cpu_to_be64(sector); + p.block_id = BE_DRBD_MAGIC + 0xbeef; + p.blksize = cpu_to_be32(size); + + p.head.magic = BE_DRBD_MAGIC; + p.head.command = cpu_to_be16(cmd); + p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + digest_size); + + mutex_lock(&mdev->data.mutex); - /* FIXME: Put the digest into the preallocated socket buffer. */ + ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0)); + ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0)); - sock = &mdev->tconn->data; - p = drbd_prepare_command(mdev, sock); - if (!p) - return -EIO; - p->sector = cpu_to_be64(sector); - p->block_id = ID_SYNCER /* unused */; - p->blksize = cpu_to_be32(size); - return drbd_send_command(mdev, sock, cmd, sizeof(*p), - digest, digest_size); + mutex_unlock(&mdev->data.mutex); + + return ok; } int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size) { - struct drbd_socket *sock; - struct p_block_req *p; + int ok; + struct p_block_req p; - sock = &mdev->tconn->data; - p = drbd_prepare_command(mdev, sock); - if (!p) - return -EIO; - p->sector = cpu_to_be64(sector); - p->block_id = ID_SYNCER /* unused */; - p->blksize = cpu_to_be32(size); - return drbd_send_command(mdev, sock, P_OV_REQUEST, sizeof(*p), NULL, 0); + p.sector = cpu_to_be64(sector); + p.block_id = BE_DRBD_MAGIC + 0xbabe; + p.blksize = cpu_to_be32(size); + + ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST, + (struct p_header80 *)&p, sizeof(p)); + return ok; } /* called on sndtimeo * returns false if we should retry, * true if we think connection is dead */ -static int we_should_drop_the_connection(struct drbd_tconn *tconn, struct socket *sock) +static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock) { int drop_it; /* long elapsed = (long)(jiffies - mdev->last_received); */ - drop_it = tconn->meta.socket == sock - || !tconn->asender.task - || get_t_state(&tconn->asender) != RUNNING - || tconn->cstate < C_WF_REPORT_PARAMS; + drop_it = mdev->meta.socket == sock + || !mdev->asender.task + || get_t_state(&mdev->asender) != Running + || mdev->state.conn < C_CONNECTED; if (drop_it) return true; - drop_it = !--tconn->ko_count; + 
drop_it = !--mdev->ko_count; if (!drop_it) { - conn_err(tconn, "[%s/%d] sock_sendmsg time expired, ko = %u\n", - current->comm, current->pid, tconn->ko_count); - request_ping(tconn); + dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n", + current->comm, current->pid, mdev->ko_count); + request_ping(mdev); } return drop_it; /* && (mdev->state == R_PRIMARY) */; } -static void drbd_update_congested(struct drbd_tconn *tconn) -{ - struct sock *sk = tconn->data.socket->sk; - if (sk->sk_wmem_queued > sk->sk_sndbuf * 4 / 5) - set_bit(NET_CONGESTED, &tconn->flags); -} - /* The idea of sendpage seems to be to put some kind of reference * to the page into the skb, and to hand it over to the NIC. In * this process get_page() gets called. @@ -1471,28 +2713,21 @@ static void drbd_update_congested(struct drbd_tconn *tconn) * with page_count == 0 or PageSlab. */ static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page, - int offset, size_t size, unsigned msg_flags) + int offset, size_t size, unsigned msg_flags) { - struct socket *socket; - void *addr; - int err; - - socket = mdev->tconn->data.socket; - addr = kmap(page) + offset; - err = drbd_send_all(mdev->tconn, socket, addr, size, msg_flags); + int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, msg_flags); kunmap(page); - if (!err) - mdev->send_cnt += size >> 9; - return err; + if (sent == size) + mdev->send_cnt += size>>9; + return sent == size; } static int _drbd_send_page(struct drbd_conf *mdev, struct page *page, int offset, size_t size, unsigned msg_flags) { - struct socket *socket = mdev->tconn->data.socket; mm_segment_t oldfs = get_fs(); + int sent, ok; int len = size; - int err = -EIO; /* e.g. XFS meta- & log-data is in slab pages, which have a * page_count of 0 and/or have PageSlab() set. @@ -1504,35 +2739,34 @@ static int _drbd_send_page(struct drbd_conf *mdev, struct page *page, return _drbd_no_send_page(mdev, page, offset, size, msg_flags); msg_flags |= MSG_NOSIGNAL; - drbd_update_congested(mdev->tconn); + drbd_update_congested(mdev); set_fs(KERNEL_DS); do { - int sent; - - sent = socket->ops->sendpage(socket, page, offset, len, msg_flags); - if (sent <= 0) { - if (sent == -EAGAIN) { - if (we_should_drop_the_connection(mdev->tconn, socket)) - break; + sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page, + offset, len, + msg_flags); + if (sent == -EAGAIN) { + if (we_should_drop_the_connection(mdev, + mdev->data.socket)) + break; + else continue; - } + } + if (sent <= 0) { dev_warn(DEV, "%s: size=%d len=%d sent=%d\n", __func__, (int)size, len, sent); - if (sent < 0) - err = sent; break; } len -= sent; offset += sent; } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/); set_fs(oldfs); - clear_bit(NET_CONGESTED, &mdev->tconn->flags); + clear_bit(NET_CONGESTED, &mdev->flags); - if (len == 0) { - err = 0; - mdev->send_cnt += size >> 9; - } - return err; + ok = (len == 0); + if (likely(ok)) + mdev->send_cnt += size>>9; + return ok; } static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio) @@ -1541,15 +2775,12 @@ static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio) int i; /* hint all but last page with MSG_MORE */ bio_for_each_segment(bvec, bio, i) { - int err; - - err = _drbd_no_send_page(mdev, bvec->bv_page, - bvec->bv_offset, bvec->bv_len, - i == bio->bi_vcnt - 1 ? 0 : MSG_MORE); - if (err) - return err; + if (!_drbd_no_send_page(mdev, bvec->bv_page, + bvec->bv_offset, bvec->bv_len, + i == bio->bi_vcnt -1 ? 
0 : MSG_MORE)) + return 0; } - return 0; + return 1; } static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio) @@ -1558,40 +2789,32 @@ static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio) int i; /* hint all but last page with MSG_MORE */ bio_for_each_segment(bvec, bio, i) { - int err; - - err = _drbd_send_page(mdev, bvec->bv_page, - bvec->bv_offset, bvec->bv_len, - i == bio->bi_vcnt - 1 ? 0 : MSG_MORE); - if (err) - return err; + if (!_drbd_send_page(mdev, bvec->bv_page, + bvec->bv_offset, bvec->bv_len, + i == bio->bi_vcnt -1 ? 0 : MSG_MORE)) + return 0; } - return 0; + return 1; } -static int _drbd_send_zc_ee(struct drbd_conf *mdev, - struct drbd_peer_request *peer_req) +static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e) { - struct page *page = peer_req->pages; - unsigned len = peer_req->i.size; - int err; - + struct page *page = e->pages; + unsigned len = e->size; /* hint all but last page with MSG_MORE */ page_chain_for_each(page) { unsigned l = min_t(unsigned, len, PAGE_SIZE); - - err = _drbd_send_page(mdev, page, 0, l, - page_chain_next(page) ? MSG_MORE : 0); - if (err) - return err; + if (!_drbd_send_page(mdev, page, 0, l, + page_chain_next(page) ? MSG_MORE : 0)) + return 0; len -= l; } - return 0; + return 1; } static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw) { - if (mdev->tconn->agreed_pro_version >= 95) + if (mdev->agreed_pro_version >= 95) return (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) | (bi_rw & REQ_FUA ? DP_FUA : 0) | (bi_rw & REQ_FLUSH ? DP_FLUSH : 0) | @@ -1605,36 +2828,50 @@ static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw) */ int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req) { - struct drbd_socket *sock; - struct p_data *p; + int ok = 1; + struct p_data p; unsigned int dp_flags = 0; + void *dgb; int dgs; - int err; - sock = &mdev->tconn->data; - p = drbd_prepare_command(mdev, sock); - dgs = mdev->tconn->integrity_tfm ? crypto_hash_digestsize(mdev->tconn->integrity_tfm) : 0; + if (!drbd_get_data_sock(mdev)) + return 0; + + dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ? 
+ crypto_hash_digestsize(mdev->integrity_w_tfm) : 0; + + if (req->size <= DRBD_MAX_SIZE_H80_PACKET) { + p.head.h80.magic = BE_DRBD_MAGIC; + p.head.h80.command = cpu_to_be16(P_DATA); + p.head.h80.length = + cpu_to_be16(sizeof(p) - sizeof(union p_header) + dgs + req->size); + } else { + p.head.h95.magic = BE_DRBD_MAGIC_BIG; + p.head.h95.command = cpu_to_be16(P_DATA); + p.head.h95.length = + cpu_to_be32(sizeof(p) - sizeof(union p_header) + dgs + req->size); + } + + p.sector = cpu_to_be64(req->sector); + p.block_id = (unsigned long)req; + p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq)); - if (!p) - return -EIO; - p->sector = cpu_to_be64(req->i.sector); - p->block_id = (unsigned long)req; - p->seq_num = cpu_to_be32(atomic_inc_return(&mdev->packet_seq)); dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw); + if (mdev->state.conn >= C_SYNC_SOURCE && mdev->state.conn <= C_PAUSED_SYNC_T) dp_flags |= DP_MAY_SET_IN_SYNC; - if (mdev->tconn->agreed_pro_version >= 100) { - if (req->rq_state & RQ_EXP_RECEIVE_ACK) - dp_flags |= DP_SEND_RECEIVE_ACK; - if (req->rq_state & RQ_EXP_WRITE_ACK) - dp_flags |= DP_SEND_WRITE_ACK; - } - p->dp_flags = cpu_to_be32(dp_flags); - if (dgs) - drbd_csum_bio(mdev, mdev->tconn->integrity_tfm, req->master_bio, p + 1); - err = __send_command(mdev->tconn, mdev->vnr, sock, P_DATA, sizeof(*p) + dgs, NULL, req->i.size); - if (!err) { + + p.dp_flags = cpu_to_be32(dp_flags); + set_bit(UNPLUG_REMOTE, &mdev->flags); + ok = (sizeof(p) == + drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0)); + if (ok && dgs) { + dgb = mdev->int_dig_out; + drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb); + ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0); + } + if (ok) { /* For protocol A, we have to memcpy the payload into * socket buffers, as we may complete right away * as soon as we handed it over to tcp, at which point the data @@ -1646,76 +2883,92 @@ int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req) * out ok after sending on this side, but does not fit on the * receiving side, we sure have detected corruption elsewhere. */ - if (!(req->rq_state & (RQ_EXP_RECEIVE_ACK | RQ_EXP_WRITE_ACK)) || dgs) - err = _drbd_send_bio(mdev, req->master_bio); + if (mdev->net_conf->wire_protocol == DRBD_PROT_A || dgs) + ok = _drbd_send_bio(mdev, req->master_bio); else - err = _drbd_send_zc_bio(mdev, req->master_bio); + ok = _drbd_send_zc_bio(mdev, req->master_bio); /* double check digest, sometimes buffers have been modified in flight. */ if (dgs > 0 && dgs <= 64) { /* 64 byte, 512 bit, is the largest digest size * currently supported in kernel crypto. */ unsigned char digest[64]; - drbd_csum_bio(mdev, mdev->tconn->integrity_tfm, req->master_bio, digest); - if (memcmp(p + 1, digest, dgs)) { + drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, digest); + if (memcmp(mdev->int_dig_out, digest, dgs)) { dev_warn(DEV, "Digest mismatch, buffer modified by upper layers during write: %llus +%u\n", - (unsigned long long)req->i.sector, req->i.size); + (unsigned long long)req->sector, req->size); } } /* else if (dgs > 64) { ... Be noisy about digest too large ... 
} */ } - mutex_unlock(&sock->mutex); /* locked by drbd_prepare_command() */ - return err; + drbd_put_data_sock(mdev); + + return ok; } /* answer packet, used to send data back for read requests: * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY) * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY) */ -int drbd_send_block(struct drbd_conf *mdev, enum drbd_packet cmd, - struct drbd_peer_request *peer_req) +int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd, + struct drbd_epoch_entry *e) { - struct drbd_socket *sock; - struct p_data *p; - int err; + int ok; + struct p_data p; + void *dgb; int dgs; - sock = &mdev->tconn->data; - p = drbd_prepare_command(mdev, sock); + dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ? + crypto_hash_digestsize(mdev->integrity_w_tfm) : 0; - dgs = mdev->tconn->integrity_tfm ? crypto_hash_digestsize(mdev->tconn->integrity_tfm) : 0; + if (e->size <= DRBD_MAX_SIZE_H80_PACKET) { + p.head.h80.magic = BE_DRBD_MAGIC; + p.head.h80.command = cpu_to_be16(cmd); + p.head.h80.length = + cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + dgs + e->size); + } else { + p.head.h95.magic = BE_DRBD_MAGIC_BIG; + p.head.h95.command = cpu_to_be16(cmd); + p.head.h95.length = + cpu_to_be32(sizeof(p) - sizeof(struct p_header80) + dgs + e->size); + } - if (!p) - return -EIO; - p->sector = cpu_to_be64(peer_req->i.sector); - p->block_id = peer_req->block_id; - p->seq_num = 0; /* unused */ - p->dp_flags = 0; - if (dgs) - drbd_csum_ee(mdev, mdev->tconn->integrity_tfm, peer_req, p + 1); - err = __send_command(mdev->tconn, mdev->vnr, sock, cmd, sizeof(*p) + dgs, NULL, peer_req->i.size); - if (!err) - err = _drbd_send_zc_ee(mdev, peer_req); - mutex_unlock(&sock->mutex); /* locked by drbd_prepare_command() */ + p.sector = cpu_to_be64(e->sector); + p.block_id = e->block_id; + /* p.seq_num = 0; No sequence numbers here.. */ - return err; + /* Only called by our kernel thread. + * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL + * in response to admin command or module unload. + */ + if (!drbd_get_data_sock(mdev)) + return 0; + + ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0); + if (ok && dgs) { + dgb = mdev->int_dig_out; + drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb); + ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0); + } + if (ok) + ok = _drbd_send_zc_ee(mdev, e); + + drbd_put_data_sock(mdev); + + return ok; } -int drbd_send_out_of_sync(struct drbd_conf *mdev, struct drbd_request *req) +int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req) { - struct drbd_socket *sock; - struct p_block_desc *p; + struct p_block_desc p; - sock = &mdev->tconn->data; - p = drbd_prepare_command(mdev, sock); - if (!p) - return -EIO; - p->sector = cpu_to_be64(req->i.sector); - p->blksize = cpu_to_be32(req->i.size); - return drbd_send_command(mdev, sock, P_OUT_OF_SYNC, sizeof(*p), NULL, 0); + p.sector = cpu_to_be64(req->sector); + p.blksize = cpu_to_be32(req->size); + + return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OUT_OF_SYNC, &p.head, sizeof(p)); } /* @@ -1734,7 +2987,7 @@ int drbd_send_out_of_sync(struct drbd_conf *mdev, struct drbd_request *req) /* * you must have down()ed the appropriate [m]sock_mutex elsewhere! 
*/ -int drbd_send(struct drbd_tconn *tconn, struct socket *sock, +int drbd_send(struct drbd_conf *mdev, struct socket *sock, void *buf, size_t size, unsigned msg_flags) { struct kvec iov; @@ -1742,7 +2995,7 @@ int drbd_send(struct drbd_tconn *tconn, struct socket *sock, int rv, sent = 0; if (!sock) - return -EBADR; + return -1000; /* THINK if (signal_pending) return ... ? */ @@ -1755,11 +3008,9 @@ int drbd_send(struct drbd_tconn *tconn, struct socket *sock, msg.msg_controllen = 0; msg.msg_flags = msg_flags | MSG_NOSIGNAL; - if (sock == tconn->data.socket) { - rcu_read_lock(); - tconn->ko_count = rcu_dereference(tconn->net_conf)->ko_count; - rcu_read_unlock(); - drbd_update_congested(tconn); + if (sock == mdev->data.socket) { + mdev->ko_count = mdev->net_conf->ko_count; + drbd_update_congested(mdev); } do { /* STRANGE @@ -1773,11 +3024,12 @@ int drbd_send(struct drbd_tconn *tconn, struct socket *sock, */ rv = kernel_sendmsg(sock, &msg, &iov, 1, size); if (rv == -EAGAIN) { - if (we_should_drop_the_connection(tconn, sock)) + if (we_should_drop_the_connection(mdev, sock)) break; else continue; } + D_ASSERT(rv != 0); if (rv == -EINTR) { flush_signals(current); rv = 0; @@ -1789,40 +3041,22 @@ int drbd_send(struct drbd_tconn *tconn, struct socket *sock, iov.iov_len -= rv; } while (sent < size); - if (sock == tconn->data.socket) - clear_bit(NET_CONGESTED, &tconn->flags); + if (sock == mdev->data.socket) + clear_bit(NET_CONGESTED, &mdev->flags); if (rv <= 0) { if (rv != -EAGAIN) { - conn_err(tconn, "%s_sendmsg returned %d\n", - sock == tconn->meta.socket ? "msock" : "sock", - rv); - conn_request_state(tconn, NS(conn, C_BROKEN_PIPE), CS_HARD); + dev_err(DEV, "%s_sendmsg returned %d\n", + sock == mdev->meta.socket ? "msock" : "sock", + rv); + drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE)); } else - conn_request_state(tconn, NS(conn, C_TIMEOUT), CS_HARD); + drbd_force_state(mdev, NS(conn, C_TIMEOUT)); } return sent; } -/** - * drbd_send_all - Send an entire buffer - * - * Returns 0 upon success and a negative error value otherwise. - */ -int drbd_send_all(struct drbd_tconn *tconn, struct socket *sock, void *buffer, - size_t size, unsigned msg_flags) -{ - int err; - - err = drbd_send(tconn, sock, buffer, size, msg_flags); - if (err < 0) - return err; - if (err != size) - return -EIO; - return 0; -} - static int drbd_open(struct block_device *bdev, fmode_t mode) { struct drbd_conf *mdev = bdev->bd_disk->private_data; @@ -1830,7 +3064,7 @@ static int drbd_open(struct block_device *bdev, fmode_t mode) int rv = 0; mutex_lock(&drbd_main_mutex); - spin_lock_irqsave(&mdev->tconn->req_lock, flags); + spin_lock_irqsave(&mdev->req_lock, flags); /* to have a stable mdev->state.role * and no race with updating open_cnt */ @@ -1843,7 +3077,7 @@ static int drbd_open(struct block_device *bdev, fmode_t mode) if (!rv) mdev->open_cnt++; - spin_unlock_irqrestore(&mdev->tconn->req_lock, flags); + spin_unlock_irqrestore(&mdev->req_lock, flags); mutex_unlock(&drbd_main_mutex); return rv; @@ -1860,14 +3094,35 @@ static int drbd_release(struct gendisk *gd, fmode_t mode) static void drbd_set_defaults(struct drbd_conf *mdev) { - /* Beware! 
The actual layout differs - * between big endian and little endian */ - mdev->state = (union drbd_dev_state) { + /* This way we get a compile error when sync_conf grows, + and we forgot to initialize it here */ + mdev->sync_conf = (struct syncer_conf) { + /* .rate = */ DRBD_RATE_DEF, + /* .after = */ DRBD_AFTER_DEF, + /* .al_extents = */ DRBD_AL_EXTENTS_DEF, + /* .verify_alg = */ {}, 0, + /* .cpu_mask = */ {}, 0, + /* .csums_alg = */ {}, 0, + /* .use_rle = */ 0, + /* .on_no_data = */ DRBD_ON_NO_DATA_DEF, + /* .c_plan_ahead = */ DRBD_C_PLAN_AHEAD_DEF, + /* .c_delay_target = */ DRBD_C_DELAY_TARGET_DEF, + /* .c_fill_target = */ DRBD_C_FILL_TARGET_DEF, + /* .c_max_rate = */ DRBD_C_MAX_RATE_DEF, + /* .c_min_rate = */ DRBD_C_MIN_RATE_DEF + }; + + /* Have to use that way, because the layout differs between + big endian and little endian */ + mdev->state = (union drbd_state) { { .role = R_SECONDARY, .peer = R_UNKNOWN, .conn = C_STANDALONE, .disk = D_DISKLESS, .pdsk = D_UNKNOWN, + .susp = 0, + .susp_nod = 0, + .susp_fen = 0 } }; } @@ -1883,17 +3138,28 @@ void drbd_init_set_defaults(struct drbd_conf *mdev) atomic_set(&mdev->rs_pending_cnt, 0); atomic_set(&mdev->unacked_cnt, 0); atomic_set(&mdev->local_cnt, 0); + atomic_set(&mdev->net_cnt, 0); + atomic_set(&mdev->packet_seq, 0); + atomic_set(&mdev->pp_in_use, 0); atomic_set(&mdev->pp_in_use_by_net, 0); atomic_set(&mdev->rs_sect_in, 0); atomic_set(&mdev->rs_sect_ev, 0); atomic_set(&mdev->ap_in_flight, 0); atomic_set(&mdev->md_io_in_use, 0); - mutex_init(&mdev->own_state_mutex); - mdev->state_mutex = &mdev->own_state_mutex; + mutex_init(&mdev->data.mutex); + mutex_init(&mdev->meta.mutex); + sema_init(&mdev->data.work.s, 0); + sema_init(&mdev->meta.work.s, 0); + mutex_init(&mdev->state_mutex); + + spin_lock_init(&mdev->data.work.q_lock); + spin_lock_init(&mdev->meta.work.q_lock); spin_lock_init(&mdev->al_lock); + spin_lock_init(&mdev->req_lock); spin_lock_init(&mdev->peer_seq_lock); + spin_lock_init(&mdev->epoch_lock); INIT_LIST_HEAD(&mdev->active_ee); INIT_LIST_HEAD(&mdev->sync_ee); @@ -1901,6 +3167,8 @@ void drbd_init_set_defaults(struct drbd_conf *mdev) INIT_LIST_HEAD(&mdev->read_ee); INIT_LIST_HEAD(&mdev->net_ee); INIT_LIST_HEAD(&mdev->resync_reads); + INIT_LIST_HEAD(&mdev->data.work.q); + INIT_LIST_HEAD(&mdev->meta.work.q); INIT_LIST_HEAD(&mdev->resync_work.list); INIT_LIST_HEAD(&mdev->unplug_work.list); INIT_LIST_HEAD(&mdev->go_diskless.list); @@ -1914,14 +3182,6 @@ void drbd_init_set_defaults(struct drbd_conf *mdev) mdev->md_sync_work.cb = w_md_sync; mdev->bm_io_work.w.cb = w_bitmap_io; mdev->start_resync_work.cb = w_start_resync; - - mdev->resync_work.mdev = mdev; - mdev->unplug_work.mdev = mdev; - mdev->go_diskless.mdev = mdev; - mdev->md_sync_work.mdev = mdev; - mdev->bm_io_work.w.mdev = mdev; - mdev->start_resync_work.mdev = mdev; - init_timer(&mdev->resync_timer); init_timer(&mdev->md_sync_timer); init_timer(&mdev->start_resync_timer); @@ -1937,10 +3197,17 @@ void drbd_init_set_defaults(struct drbd_conf *mdev) init_waitqueue_head(&mdev->misc_wait); init_waitqueue_head(&mdev->state_wait); + init_waitqueue_head(&mdev->net_cnt_wait); init_waitqueue_head(&mdev->ee_wait); init_waitqueue_head(&mdev->al_wait); init_waitqueue_head(&mdev->seq_wait); + drbd_thread_init(mdev, &mdev->receiver, drbdd_init); + drbd_thread_init(mdev, &mdev->worker, drbd_worker); + drbd_thread_init(mdev, &mdev->asender, drbd_asender); + + mdev->agreed_pro_version = PRO_VERSION_MAX; + mdev->write_ordering = WO_bdev_flush; mdev->resync_wenr = LC_FREE; mdev->peer_max_bio_size 
= DRBD_MAX_BIO_SIZE_SAFE; mdev->local_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE; @@ -1949,10 +3216,13 @@ void drbd_init_set_defaults(struct drbd_conf *mdev) void drbd_mdev_cleanup(struct drbd_conf *mdev) { int i; - if (mdev->tconn->receiver.t_state != NONE) + if (mdev->receiver.t_state != None) dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n", - mdev->tconn->receiver.t_state); + mdev->receiver.t_state); + /* no need to lock it, I'm the only thread alive */ + if (atomic_read(&mdev->current_epoch->epoch_size) != 0) + dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size)); mdev->al_writ_cnt = mdev->bm_writ_cnt = mdev->read_cnt = @@ -1969,7 +3239,7 @@ void drbd_mdev_cleanup(struct drbd_conf *mdev) mdev->rs_mark_left[i] = 0; mdev->rs_mark_time[i] = 0; } - D_ASSERT(mdev->tconn->net_conf == NULL); + D_ASSERT(mdev->net_conf == NULL); drbd_set_my_capacity(mdev, 0); if (mdev->bitmap) { @@ -1978,18 +3248,21 @@ void drbd_mdev_cleanup(struct drbd_conf *mdev) drbd_bm_cleanup(mdev); } - drbd_free_bc(mdev->ldev); - mdev->ldev = NULL; - + drbd_free_resources(mdev); clear_bit(AL_SUSPENDED, &mdev->flags); + /* + * currently we drbd_init_ee only on module load, so + * we may do drbd_release_ee only on module unload! + */ D_ASSERT(list_empty(&mdev->active_ee)); D_ASSERT(list_empty(&mdev->sync_ee)); D_ASSERT(list_empty(&mdev->done_ee)); D_ASSERT(list_empty(&mdev->read_ee)); D_ASSERT(list_empty(&mdev->net_ee)); D_ASSERT(list_empty(&mdev->resync_reads)); - D_ASSERT(list_empty(&mdev->tconn->sender_work.q)); + D_ASSERT(list_empty(&mdev->data.work.q)); + D_ASSERT(list_empty(&mdev->meta.work.q)); D_ASSERT(list_empty(&mdev->resync_work.list)); D_ASSERT(list_empty(&mdev->unplug_work.list)); D_ASSERT(list_empty(&mdev->go_diskless.list)); @@ -2063,7 +3336,7 @@ static int drbd_create_mempools(void) goto Enomem; drbd_ee_cache = kmem_cache_create( - "drbd_ee", sizeof(struct drbd_peer_request), 0, 0, NULL); + "drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL); if (drbd_ee_cache == NULL) goto Enomem; @@ -2078,9 +3351,11 @@ static int drbd_create_mempools(void) goto Enomem; /* mempools */ +#ifdef COMPAT_HAVE_BIOSET_CREATE drbd_md_io_bio_set = bioset_create(DRBD_MIN_POOL_PAGES, 0); if (drbd_md_io_bio_set == NULL) goto Enomem; +#endif drbd_md_io_page_pool = mempool_create_page_pool(DRBD_MIN_POOL_PAGES, 0); if (drbd_md_io_page_pool == NULL) @@ -2129,53 +3404,73 @@ static struct notifier_block drbd_notifier = { .notifier_call = drbd_notify_sys, }; -static void drbd_release_all_peer_reqs(struct drbd_conf *mdev) +static void drbd_release_ee_lists(struct drbd_conf *mdev) { int rr; - rr = drbd_free_peer_reqs(mdev, &mdev->active_ee); + rr = drbd_release_ee(mdev, &mdev->active_ee); if (rr) dev_err(DEV, "%d EEs in active list found!\n", rr); - rr = drbd_free_peer_reqs(mdev, &mdev->sync_ee); + rr = drbd_release_ee(mdev, &mdev->sync_ee); if (rr) dev_err(DEV, "%d EEs in sync list found!\n", rr); - rr = drbd_free_peer_reqs(mdev, &mdev->read_ee); + rr = drbd_release_ee(mdev, &mdev->read_ee); if (rr) dev_err(DEV, "%d EEs in read list found!\n", rr); - rr = drbd_free_peer_reqs(mdev, &mdev->done_ee); + rr = drbd_release_ee(mdev, &mdev->done_ee); if (rr) dev_err(DEV, "%d EEs in done list found!\n", rr); - rr = drbd_free_peer_reqs(mdev, &mdev->net_ee); + rr = drbd_release_ee(mdev, &mdev->net_ee); if (rr) dev_err(DEV, "%d EEs in net list found!\n", rr); } -/* caution. no locking. */ -void drbd_minor_destroy(struct kref *kref) +/* caution. no locking. + * currently only used from module cleanup code. 
*/ +static void drbd_delete_device(unsigned int minor) { - struct drbd_conf *mdev = container_of(kref, struct drbd_conf, kref); - struct drbd_tconn *tconn = mdev->tconn; + struct drbd_conf *mdev = minor_to_mdev(minor); + + if (!mdev) + return; del_timer_sync(&mdev->request_timer); /* paranoia asserts */ - D_ASSERT(mdev->open_cnt == 0); + if (mdev->open_cnt != 0) + dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt, + __FILE__ , __LINE__); + + ERR_IF (!list_empty(&mdev->data.work.q)) { + struct list_head *lp; + list_for_each(lp, &mdev->data.work.q) { + dev_err(DEV, "lp = %p\n", lp); + } + }; /* end paranoia asserts */ + del_gendisk(mdev->vdisk); + /* cleanup stuff that may have been allocated during * device (re-)configuration or state changes */ if (mdev->this_bdev) bdput(mdev->this_bdev); - drbd_free_bc(mdev->ldev); - mdev->ldev = NULL; + drbd_free_resources(mdev); - drbd_release_all_peer_reqs(mdev); + drbd_release_ee_lists(mdev); + + /* should be freed on disconnect? */ + kfree(mdev->ee_hash); + /* + mdev->ee_hash_s = 0; + mdev->ee_hash = NULL; + */ lc_destroy(mdev->act_log); lc_destroy(mdev->resync); @@ -2183,101 +3478,19 @@ void drbd_minor_destroy(struct kref *kref) kfree(mdev->p_uuid); /* mdev->p_uuid = NULL; */ - if (mdev->bitmap) /* should no longer be there. */ - drbd_bm_cleanup(mdev); - __free_page(mdev->md_io_page); - put_disk(mdev->vdisk); - blk_cleanup_queue(mdev->rq_queue); - kfree(mdev->rs_plan_s); - kfree(mdev); - - kref_put(&tconn->kref, &conn_destroy); -} - -/* One global retry thread, if we need to push back some bio and have it - * reinserted through our make request function. - */ -static struct retry_worker { - struct workqueue_struct *wq; - struct work_struct worker; - - spinlock_t lock; - struct list_head writes; -} retry; - -static void do_retry(struct work_struct *ws) -{ - struct retry_worker *retry = container_of(ws, struct retry_worker, worker); - LIST_HEAD(writes); - struct drbd_request *req, *tmp; - - spin_lock_irq(&retry->lock); - list_splice_init(&retry->writes, &writes); - spin_unlock_irq(&retry->lock); - - list_for_each_entry_safe(req, tmp, &writes, tl_requests) { - struct drbd_conf *mdev = req->w.mdev; - struct bio *bio = req->master_bio; - unsigned long start_time = req->start_time; - bool expected; - - expected = - expect(atomic_read(&req->completion_ref) == 0) && - expect(req->rq_state & RQ_POSTPONED) && - expect((req->rq_state & RQ_LOCAL_PENDING) == 0 || - (req->rq_state & RQ_LOCAL_ABORTED) != 0); - - if (!expected) - dev_err(DEV, "req=%p completion_ref=%d rq_state=%x\n", - req, atomic_read(&req->completion_ref), - req->rq_state); - - /* We still need to put one kref associated with the - * "completion_ref" going zero in the code path that queued it - * here. The request object may still be referenced by a - * frozen local req->private_bio, in case we force-detached. - */ - kref_put(&req->kref, drbd_req_destroy); - - /* A single suspended or otherwise blocking device may stall - * all others as well. Fortunately, this code path is to - * recover from a situation that "should not happen": - * concurrent writes in multi-primary setup. - * In a "normal" lifecycle, this workqueue is supposed to be - * destroyed without ever doing anything. - * If it turns out to be an issue anyways, we can do per - * resource (replication group) or per device (minor) retry - * workqueues instead. - */ - - /* We are not just doing generic_make_request(), - * as we want to keep the start_time information. 
*/ - inc_ap_bio(mdev); - __drbd_make_request(mdev, bio, start_time); - } -} - -void drbd_restart_request(struct drbd_request *req) -{ - unsigned long flags; - spin_lock_irqsave(&retry.lock, flags); - list_move_tail(&req->tl_requests, &retry.writes); - spin_unlock_irqrestore(&retry.lock, flags); - - /* Drop the extra reference that would otherwise - * have been dropped by complete_master_bio. - * do_retry() needs to grab a new one. */ - dec_ap_bio(req->w.mdev); + kfree(mdev->int_dig_out); + kfree(mdev->int_dig_in); + kfree(mdev->int_dig_vv); - queue_work(retry.wq, &retry.worker); + /* cleanup the rest that has been + * allocated from drbd_new_device + * and actually free the mdev itself */ + drbd_free_mdev(mdev); } - static void drbd_cleanup(void) { unsigned int i; - struct drbd_conf *mdev; - struct drbd_tconn *tconn, *tmp; unregister_reboot_notifier(&drbd_notifier); @@ -2292,31 +3505,19 @@ static void drbd_cleanup(void) if (drbd_proc) remove_proc_entry("drbd", NULL); - if (retry.wq) - destroy_workqueue(retry.wq); - - drbd_genl_unregister(); + drbd_nl_cleanup(); - idr_for_each_entry(&minors, mdev, i) { - idr_remove(&minors, mdev_to_minor(mdev)); - idr_remove(&mdev->tconn->volumes, mdev->vnr); - del_gendisk(mdev->vdisk); - /* synchronize_rcu(); No other threads running at this point */ - kref_put(&mdev->kref, &drbd_minor_destroy); + if (minor_table) { + i = minor_count; + while (i--) + drbd_delete_device(i); + drbd_destroy_mempools(); } - /* not _rcu since, no other updater anymore. Genl already unregistered */ - list_for_each_entry_safe(tconn, tmp, &drbd_tconns, all_tconn) { - list_del(&tconn->all_tconn); /* not _rcu no proc, not other threads */ - /* synchronize_rcu(); */ - kref_put(&tconn->kref, &conn_destroy); - } + kfree(minor_table); - drbd_destroy_mempools(); unregister_blkdev(DRBD_MAJOR, "drbd"); - idr_destroy(&minors); - printk(KERN_INFO "drbd: module cleanup done.\n"); } @@ -2341,7 +3542,7 @@ static int drbd_congested(void *congested_data, int bdi_bits) goto out; } - if (test_bit(CALLBACK_PENDING, &mdev->tconn->flags)) { + if (test_bit(CALLBACK_PENDING, &mdev->flags)) { r |= (1 << BDI_async_congested); /* Without good local data, we would need to read from remote, * and that would need the worker thread as well, which is @@ -2365,7 +3566,7 @@ static int drbd_congested(void *congested_data, int bdi_bits) reason = 'b'; } - if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->tconn->flags)) { + if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->flags)) { r |= (1 << BDI_async_congested); reason = reason == 'b' ? 
'a' : 'n'; } @@ -2375,243 +3576,20 @@ static int drbd_congested(void *congested_data, int bdi_bits) return r; } -static void drbd_init_workqueue(struct drbd_work_queue* wq) -{ - spin_lock_init(&wq->q_lock); - INIT_LIST_HEAD(&wq->q); - init_waitqueue_head(&wq->q_wait); -} - -struct drbd_tconn *conn_get_by_name(const char *name) -{ - struct drbd_tconn *tconn; - - if (!name || !name[0]) - return NULL; - - rcu_read_lock(); - list_for_each_entry_rcu(tconn, &drbd_tconns, all_tconn) { - if (!strcmp(tconn->name, name)) { - kref_get(&tconn->kref); - goto found; - } - } - tconn = NULL; -found: - rcu_read_unlock(); - return tconn; -} - -struct drbd_tconn *conn_get_by_addrs(void *my_addr, int my_addr_len, - void *peer_addr, int peer_addr_len) -{ - struct drbd_tconn *tconn; - - rcu_read_lock(); - list_for_each_entry_rcu(tconn, &drbd_tconns, all_tconn) { - if (tconn->my_addr_len == my_addr_len && - tconn->peer_addr_len == peer_addr_len && - !memcmp(&tconn->my_addr, my_addr, my_addr_len) && - !memcmp(&tconn->peer_addr, peer_addr, peer_addr_len)) { - kref_get(&tconn->kref); - goto found; - } - } - tconn = NULL; -found: - rcu_read_unlock(); - return tconn; -} - -static int drbd_alloc_socket(struct drbd_socket *socket) -{ - socket->rbuf = (void *) __get_free_page(GFP_KERNEL); - if (!socket->rbuf) - return -ENOMEM; - socket->sbuf = (void *) __get_free_page(GFP_KERNEL); - if (!socket->sbuf) - return -ENOMEM; - return 0; -} - -static void drbd_free_socket(struct drbd_socket *socket) -{ - free_page((unsigned long) socket->sbuf); - free_page((unsigned long) socket->rbuf); -} - -void conn_free_crypto(struct drbd_tconn *tconn) -{ - drbd_free_sock(tconn); - - crypto_free_hash(tconn->csums_tfm); - crypto_free_hash(tconn->verify_tfm); - crypto_free_hash(tconn->cram_hmac_tfm); - crypto_free_hash(tconn->integrity_tfm); - crypto_free_hash(tconn->peer_integrity_tfm); - kfree(tconn->int_dig_in); - kfree(tconn->int_dig_vv); - - tconn->csums_tfm = NULL; - tconn->verify_tfm = NULL; - tconn->cram_hmac_tfm = NULL; - tconn->integrity_tfm = NULL; - tconn->peer_integrity_tfm = NULL; - tconn->int_dig_in = NULL; - tconn->int_dig_vv = NULL; -} - -int set_resource_options(struct drbd_tconn *tconn, struct res_opts *res_opts) -{ - cpumask_var_t new_cpu_mask; - int err; - - if (!zalloc_cpumask_var(&new_cpu_mask, GFP_KERNEL)) - return -ENOMEM; - /* - retcode = ERR_NOMEM; - drbd_msg_put_info("unable to allocate cpumask"); - */ - - /* silently ignore cpu mask on UP kernel */ - if (nr_cpu_ids > 1 && res_opts->cpu_mask[0] != 0) { - /* FIXME: Get rid of constant 32 here */ - err = bitmap_parse(res_opts->cpu_mask, 32, - cpumask_bits(new_cpu_mask), nr_cpu_ids); - if (err) { - conn_warn(tconn, "bitmap_parse() failed with %d\n", err); - /* retcode = ERR_CPU_MASK_PARSE; */ - goto fail; - } - } - tconn->res_opts = *res_opts; - if (!cpumask_equal(tconn->cpu_mask, new_cpu_mask)) { - cpumask_copy(tconn->cpu_mask, new_cpu_mask); - drbd_calc_cpu_mask(tconn); - tconn->receiver.reset_cpu_mask = 1; - tconn->asender.reset_cpu_mask = 1; - tconn->worker.reset_cpu_mask = 1; - } - err = 0; - -fail: - free_cpumask_var(new_cpu_mask); - return err; - -} - -/* caller must be under genl_lock() */ -struct drbd_tconn *conn_create(const char *name, struct res_opts *res_opts) -{ - struct drbd_tconn *tconn; - - tconn = kzalloc(sizeof(struct drbd_tconn), GFP_KERNEL); - if (!tconn) - return NULL; - - tconn->name = kstrdup(name, GFP_KERNEL); - if (!tconn->name) - goto fail; - - if (drbd_alloc_socket(&tconn->data)) - goto fail; - if (drbd_alloc_socket(&tconn->meta)) - goto 
fail; - - if (!zalloc_cpumask_var(&tconn->cpu_mask, GFP_KERNEL)) - goto fail; - - if (set_resource_options(tconn, res_opts)) - goto fail; - - tconn->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL); - if (!tconn->current_epoch) - goto fail; - - INIT_LIST_HEAD(&tconn->transfer_log); - - INIT_LIST_HEAD(&tconn->current_epoch->list); - tconn->epochs = 1; - spin_lock_init(&tconn->epoch_lock); - tconn->write_ordering = WO_bdev_flush; - - tconn->send.seen_any_write_yet = false; - tconn->send.current_epoch_nr = 0; - tconn->send.current_epoch_writes = 0; - - tconn->cstate = C_STANDALONE; - mutex_init(&tconn->cstate_mutex); - spin_lock_init(&tconn->req_lock); - mutex_init(&tconn->conf_update); - init_waitqueue_head(&tconn->ping_wait); - idr_init(&tconn->volumes); - - drbd_init_workqueue(&tconn->sender_work); - mutex_init(&tconn->data.mutex); - mutex_init(&tconn->meta.mutex); - - drbd_thread_init(tconn, &tconn->receiver, drbdd_init, "receiver"); - drbd_thread_init(tconn, &tconn->worker, drbd_worker, "worker"); - drbd_thread_init(tconn, &tconn->asender, drbd_asender, "asender"); - - kref_init(&tconn->kref); - list_add_tail_rcu(&tconn->all_tconn, &drbd_tconns); - - return tconn; - -fail: - kfree(tconn->current_epoch); - free_cpumask_var(tconn->cpu_mask); - drbd_free_socket(&tconn->meta); - drbd_free_socket(&tconn->data); - kfree(tconn->name); - kfree(tconn); - - return NULL; -} - -void conn_destroy(struct kref *kref) -{ - struct drbd_tconn *tconn = container_of(kref, struct drbd_tconn, kref); - - if (atomic_read(&tconn->current_epoch->epoch_size) != 0) - conn_err(tconn, "epoch_size:%d\n", atomic_read(&tconn->current_epoch->epoch_size)); - kfree(tconn->current_epoch); - - idr_destroy(&tconn->volumes); - - free_cpumask_var(tconn->cpu_mask); - drbd_free_socket(&tconn->meta); - drbd_free_socket(&tconn->data); - kfree(tconn->name); - kfree(tconn->int_dig_in); - kfree(tconn->int_dig_vv); - kfree(tconn); -} - -enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor, int vnr) +struct drbd_conf *drbd_new_device(unsigned int minor) { struct drbd_conf *mdev; struct gendisk *disk; struct request_queue *q; - int vnr_got = vnr; - int minor_got = minor; - enum drbd_ret_code err = ERR_NOMEM; - - mdev = minor_to_mdev(minor); - if (mdev) - return ERR_MINOR_EXISTS; /* GFP_KERNEL, we are outside of all write-out paths */ mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL); if (!mdev) - return ERR_NOMEM; - - kref_get(&tconn->kref); - mdev->tconn = tconn; + return NULL; + if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL)) + goto out_no_cpumask; mdev->minor = minor; - mdev->vnr = vnr; drbd_init_set_defaults(mdev); @@ -2649,7 +3627,7 @@ enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor, blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8); blk_queue_bounce_limit(q, BLK_BOUNCE_ANY); blk_queue_merge_bvec(q, drbd_merge_bvec); - q->queue_lock = &mdev->tconn->req_lock; /* needed since we use */ + q->queue_lock = &mdev->req_lock; mdev->md_io_page = alloc_page(GFP_KERNEL); if (!mdev->md_io_page) @@ -2657,44 +3635,30 @@ enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor, if (drbd_bm_init(mdev)) goto out_no_bitmap; - mdev->read_requests = RB_ROOT; - mdev->write_requests = RB_ROOT; - - if (!idr_pre_get(&minors, GFP_KERNEL)) - goto out_no_minor_idr; - if (idr_get_new_above(&minors, mdev, minor, &minor_got)) - goto out_no_minor_idr; - if (minor_got != minor) { - err = ERR_MINOR_EXISTS; - drbd_msg_put_info("requested minor exists 
already"); - goto out_idr_remove_minor; - } - - if (!idr_pre_get(&tconn->volumes, GFP_KERNEL)) - goto out_idr_remove_minor; - if (idr_get_new_above(&tconn->volumes, mdev, vnr, &vnr_got)) - goto out_idr_remove_minor; - if (vnr_got != vnr) { - err = ERR_INVALID_REQUEST; - drbd_msg_put_info("requested volume exists already"); - goto out_idr_remove_vol; - } - add_disk(disk); - kref_init(&mdev->kref); /* one ref for both idrs and the the add_disk */ - - /* inherit the connection state */ - mdev->state.conn = tconn->cstate; - if (mdev->state.conn == C_WF_REPORT_PARAMS) - drbd_connected(mdev); - - return NO_ERROR; - -out_idr_remove_vol: - idr_remove(&tconn->volumes, vnr_got); -out_idr_remove_minor: - idr_remove(&minors, minor_got); - synchronize_rcu(); -out_no_minor_idr: + /* no need to lock access, we are still initializing this minor device. */ + if (!tl_init(mdev)) + goto out_no_tl; + + mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL); + if (!mdev->app_reads_hash) + goto out_no_app_reads; + + mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL); + if (!mdev->current_epoch) + goto out_no_epoch; + + INIT_LIST_HEAD(&mdev->current_epoch->list); + mdev->epochs = 1; + + return mdev; + +/* out_whatever_else: + kfree(mdev->current_epoch); */ +out_no_epoch: + kfree(mdev->app_reads_hash); +out_no_app_reads: + tl_cleanup(mdev); +out_no_tl: drbd_bm_cleanup(mdev); out_no_bitmap: __free_page(mdev->md_io_page); @@ -2703,25 +3667,55 @@ enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor, out_no_disk: blk_cleanup_queue(q); out_no_q: + free_cpumask_var(mdev->cpu_mask); +out_no_cpumask: + kfree(mdev); + return NULL; +} + +/* counterpart of drbd_new_device. + * last part of drbd_delete_device. */ +void drbd_free_mdev(struct drbd_conf *mdev) +{ + kfree(mdev->current_epoch); + kfree(mdev->app_reads_hash); + tl_cleanup(mdev); + if (mdev->bitmap) /* should no longer be there. 
*/ + drbd_bm_cleanup(mdev); + __free_page(mdev->md_io_page); + put_disk(mdev->vdisk); + blk_cleanup_queue(mdev->rq_queue); + free_cpumask_var(mdev->cpu_mask); + drbd_free_tl_hash(mdev); kfree(mdev); - kref_put(&tconn->kref, &conn_destroy); - return err; } + int __init drbd_init(void) { int err; + if (sizeof(struct p_handshake) != 80) { + printk(KERN_ERR + "drbd: never change the size or layout " + "of the HandShake packet.\n"); + return -EINVAL; + } + if (minor_count < DRBD_MINOR_COUNT_MIN || minor_count > DRBD_MINOR_COUNT_MAX) { printk(KERN_ERR - "drbd: invalid minor_count (%d)\n", minor_count); + "drbd: invalid minor_count (%d)\n", minor_count); #ifdef MODULE return -EINVAL; #else - minor_count = DRBD_MINOR_COUNT_DEF; + minor_count = 8; #endif } + err = drbd_nl_init(); + if (err) + return err; + err = register_blkdev(DRBD_MAJOR, "drbd"); if (err) { printk(KERN_ERR @@ -2730,13 +3724,6 @@ int __init drbd_init(void) return err; } - err = drbd_genl_register(); - if (err) { - printk(KERN_ERR "drbd: unable to register generic netlink family\n"); - goto fail; - } - - register_reboot_notifier(&drbd_notifier); /* @@ -2747,29 +3734,22 @@ int __init drbd_init(void) init_waitqueue_head(&drbd_pp_wait); drbd_proc = NULL; /* play safe for drbd_cleanup */ - idr_init(&minors); + minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count, + GFP_KERNEL); + if (!minor_table) + goto Enomem; err = drbd_create_mempools(); if (err) - goto fail; + goto Enomem; drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL); if (!drbd_proc) { printk(KERN_ERR "drbd: unable to register proc file\n"); - goto fail; + goto Enomem; } rwlock_init(&global_state_lock); - INIT_LIST_HEAD(&drbd_tconns); - - retry.wq = create_singlethread_workqueue("drbd-reissue"); - if (!retry.wq) { - printk(KERN_ERR "drbd: unable to create retry workqueue\n"); - goto fail; - } - INIT_WORK(&retry.worker, do_retry); - spin_lock_init(&retry.lock); - INIT_LIST_HEAD(&retry.writes); printk(KERN_INFO "drbd: initialized. " "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n", @@ -2777,10 +3757,11 @@ int __init drbd_init(void) printk(KERN_INFO "drbd: %s\n", drbd_buildtag()); printk(KERN_INFO "drbd: registered as block device major %d\n", DRBD_MAJOR); + printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table); return 0; /* Success! 
*/ -fail: +Enomem: drbd_cleanup(); if (err == -ENOMEM) /* currently always the case */ @@ -2801,42 +3782,47 @@ void drbd_free_bc(struct drbd_backing_dev *ldev) kfree(ldev); } -void drbd_free_sock(struct drbd_tconn *tconn) +void drbd_free_sock(struct drbd_conf *mdev) { - if (tconn->data.socket) { - mutex_lock(&tconn->data.mutex); - kernel_sock_shutdown(tconn->data.socket, SHUT_RDWR); - sock_release(tconn->data.socket); - tconn->data.socket = NULL; - mutex_unlock(&tconn->data.mutex); + if (mdev->data.socket) { + mutex_lock(&mdev->data.mutex); + kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR); + sock_release(mdev->data.socket); + mdev->data.socket = NULL; + mutex_unlock(&mdev->data.mutex); } - if (tconn->meta.socket) { - mutex_lock(&tconn->meta.mutex); - kernel_sock_shutdown(tconn->meta.socket, SHUT_RDWR); - sock_release(tconn->meta.socket); - tconn->meta.socket = NULL; - mutex_unlock(&tconn->meta.mutex); + if (mdev->meta.socket) { + mutex_lock(&mdev->meta.mutex); + kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR); + sock_release(mdev->meta.socket); + mdev->meta.socket = NULL; + mutex_unlock(&mdev->meta.mutex); } } -/* meta data management */ -void conn_md_sync(struct drbd_tconn *tconn) +void drbd_free_resources(struct drbd_conf *mdev) { - struct drbd_conf *mdev; - int vnr; + crypto_free_hash(mdev->csums_tfm); + mdev->csums_tfm = NULL; + crypto_free_hash(mdev->verify_tfm); + mdev->verify_tfm = NULL; + crypto_free_hash(mdev->cram_hmac_tfm); + mdev->cram_hmac_tfm = NULL; + crypto_free_hash(mdev->integrity_w_tfm); + mdev->integrity_w_tfm = NULL; + crypto_free_hash(mdev->integrity_r_tfm); + mdev->integrity_r_tfm = NULL; - rcu_read_lock(); - idr_for_each_entry(&tconn->volumes, mdev, vnr) { - kref_get(&mdev->kref); - rcu_read_unlock(); - drbd_md_sync(mdev); - kref_put(&mdev->kref, &drbd_minor_destroy); - rcu_read_lock(); - } - rcu_read_unlock(); + drbd_free_sock(mdev); + + __no_warn(local, + drbd_free_bc(mdev->ldev); + mdev->ldev = NULL;); } +/* meta data management */ + struct meta_data_on_disk { u64 la_size; /* last agreed size. */ u64 uuid[UI_SIZE]; /* UUIDs. */ @@ -2847,7 +3833,7 @@ struct meta_data_on_disk { u32 md_size_sect; u32 al_offset; /* offset to this block */ u32 al_nr_extents; /* important for restoring the AL */ - /* `-- act_log->nr_elements <-- ldev->dc.al_extents */ + /* `-- act_log->nr_elements <-- sync_conf.al_extents */ u32 bm_offset; /* offset to the bitmap, from here */ u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */ u32 la_peer_max_bio_size; /* last peer max_bio_size */ @@ -2885,7 +3871,7 @@ void drbd_md_sync(struct drbd_conf *mdev) for (i = UI_CURRENT; i < UI_SIZE; i++) buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]); buffer->flags = cpu_to_be32(mdev->ldev->md.flags); - buffer->magic = cpu_to_be32(DRBD_MD_MAGIC_84_UNCLEAN); + buffer->magic = cpu_to_be32(DRBD_MD_MAGIC); buffer->md_size_sect = cpu_to_be32(mdev->ldev->md.md_size_sect); buffer->al_offset = cpu_to_be32(mdev->ldev->md.al_offset); @@ -2899,7 +3885,7 @@ void drbd_md_sync(struct drbd_conf *mdev) D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset); sector = mdev->ldev->md.md_offset; - if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) { + if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) { /* this was a try anyways ... */ dev_err(DEV, "meta data update failed!\n"); drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR); @@ -2920,12 +3906,11 @@ void drbd_md_sync(struct drbd_conf *mdev) * @bdev: Device from which the meta data should be read in. 
* * Return 0 (NO_ERROR) on success, and an enum drbd_ret_code in case - * something goes wrong. + * something goes wrong. Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID. */ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) { struct meta_data_on_disk *buffer; - u32 magic, flags; int i, rv = NO_ERROR; if (!get_ldev_if_state(mdev, D_ATTACHING)) @@ -2935,7 +3920,7 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) if (!buffer) goto out; - if (drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) { + if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) { /* NOTE: can't do normal error processing here as this is called BEFORE disk is attached */ dev_err(DEV, "Error while reading metadata.\n"); @@ -2943,20 +3928,8 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) goto err; } - magic = be32_to_cpu(buffer->magic); - flags = be32_to_cpu(buffer->flags); - if (magic == DRBD_MD_MAGIC_84_UNCLEAN || - (magic == DRBD_MD_MAGIC_08 && !(flags & MDF_AL_CLEAN))) { - /* btw: that's Activity Log clean, not "all" clean. */ - dev_err(DEV, "Found unclean meta data. Did you \"drbdadm apply-al\"?\n"); - rv = ERR_MD_UNCLEAN; - goto err; - } - if (magic != DRBD_MD_MAGIC_08) { - if (magic == DRBD_MD_MAGIC_07) - dev_err(DEV, "Found old (0.7) meta data magic. Did you \"drbdadm create-md\"?\n"); - else - dev_err(DEV, "Meta data magic not found. Did you \"drbdadm create-md\"?\n"); + if (be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) { + dev_err(DEV, "Error while reading metadata, magic not found.\n"); rv = ERR_MD_INVALID; goto err; } @@ -2990,16 +3963,20 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) for (i = UI_CURRENT; i < UI_SIZE; i++) bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]); bdev->md.flags = be32_to_cpu(buffer->flags); + mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents); bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid); - spin_lock_irq(&mdev->tconn->req_lock); + spin_lock_irq(&mdev->req_lock); if (mdev->state.conn < C_CONNECTED) { unsigned int peer; peer = be32_to_cpu(buffer->la_peer_max_bio_size); peer = max(peer, DRBD_MAX_BIO_SIZE_SAFE); mdev->peer_max_bio_size = peer; } - spin_unlock_irq(&mdev->tconn->req_lock); + spin_unlock_irq(&mdev->req_lock); + + if (mdev->sync_conf.al_extents < 7) + mdev->sync_conf.al_extents = 127; err: drbd_md_put_buffer(mdev); @@ -3034,7 +4011,7 @@ void drbd_md_mark_dirty(struct drbd_conf *mdev) } #endif -void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local) +static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local) { int i; @@ -3042,7 +4019,7 @@ void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local) mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i]; } -void __drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local) +void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local) { if (idx == UI_CURRENT) { if (mdev->state.role == R_PRIMARY) @@ -3057,24 +4034,14 @@ void __drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local drbd_md_mark_dirty(mdev); } -void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local) -{ - unsigned long flags; - spin_lock_irqsave(&mdev->ldev->md.uuid_lock, flags); - __drbd_uuid_set(mdev, idx, val); - spin_unlock_irqrestore(&mdev->ldev->md.uuid_lock, flags); -} void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local) { - unsigned long flags; - 
spin_lock_irqsave(&mdev->ldev->md.uuid_lock, flags); if (mdev->ldev->md.uuid[idx]) { drbd_uuid_move_history(mdev); mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx]; } - __drbd_uuid_set(mdev, idx, val); - spin_unlock_irqrestore(&mdev->ldev->md.uuid_lock, flags); + _drbd_uuid_set(mdev, idx, val); } /** @@ -3087,20 +4054,15 @@ void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local) void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local) { u64 val; - unsigned long long bm_uuid; - - get_random_bytes(&val, sizeof(u64)); - - spin_lock_irq(&mdev->ldev->md.uuid_lock); - bm_uuid = mdev->ldev->md.uuid[UI_BITMAP]; + unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP]; if (bm_uuid) dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid); mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT]; - __drbd_uuid_set(mdev, UI_CURRENT, val); - spin_unlock_irq(&mdev->ldev->md.uuid_lock); + get_random_bytes(&val, sizeof(u64)); + _drbd_uuid_set(mdev, UI_CURRENT, val); drbd_print_uuids(mdev, "new current UUID"); /* get it to stable storage _now_ */ drbd_md_sync(mdev); @@ -3108,11 +4070,9 @@ void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local) void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local) { - unsigned long flags; if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0) return; - spin_lock_irqsave(&mdev->ldev->md.uuid_lock, flags); if (val == 0) { drbd_uuid_move_history(mdev); mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP]; @@ -3124,8 +4084,6 @@ void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local) mdev->ldev->md.uuid[UI_BITMAP] = val & ~((u64)1); } - spin_unlock_irqrestore(&mdev->ldev->md.uuid_lock, flags); - drbd_md_mark_dirty(mdev); } @@ -3177,10 +4135,9 @@ int drbd_bmio_clear_n_write(struct drbd_conf *mdev) return rv; } -static int w_bitmap_io(struct drbd_work *w, int unused) +static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused) { struct bm_io_work *work = container_of(w, struct bm_io_work, w); - struct drbd_conf *mdev = w->mdev; int rv = -EIO; D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0); @@ -3192,7 +4149,8 @@ static int w_bitmap_io(struct drbd_work *w, int unused) put_ldev(mdev); } - clear_bit_unlock(BITMAP_IO, &mdev->flags); + clear_bit(BITMAP_IO, &mdev->flags); + smp_mb__after_clear_bit(); wake_up(&mdev->misc_wait); if (work->done) @@ -3202,7 +4160,7 @@ static int w_bitmap_io(struct drbd_work *w, int unused) work->why = NULL; work->flags = 0; - return 0; + return 1; } void drbd_ldev_destroy(struct drbd_conf *mdev) @@ -3215,51 +4173,29 @@ void drbd_ldev_destroy(struct drbd_conf *mdev) drbd_free_bc(mdev->ldev); mdev->ldev = NULL;); + if (mdev->md_io_tmpp) { + __free_page(mdev->md_io_tmpp); + mdev->md_io_tmpp = NULL; + } clear_bit(GO_DISKLESS, &mdev->flags); } -static int w_go_diskless(struct drbd_work *w, int unused) +static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused) { - struct drbd_conf *mdev = w->mdev; - D_ASSERT(mdev->state.disk == D_FAILED); /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will * inc/dec it frequently. Once we are D_DISKLESS, no one will touch * the protected members anymore, though, so once put_ldev reaches zero * again, it will be safe to free them. */ - - /* Try to write changed bitmap pages, read errors may have just - * set some bits outside the area covered by the activity log. 
- * - * If we have an IO error during the bitmap writeout, - * we will want a full sync next time, just in case. - * (Do we want a specific meta data flag for this?) - * - * If that does not make it to stable storage either, - * we cannot do anything about that anymore. - * - * We still need to check if both bitmap and ldev are present, we may - * end up here after a failed attach, before ldev was even assigned. - */ - if (mdev->bitmap && mdev->ldev) { - if (drbd_bitmap_io_from_worker(mdev, drbd_bm_write, - "detach", BM_LOCKED_MASK)) { - if (test_bit(WAS_READ_ERROR, &mdev->flags)) { - drbd_md_set_flag(mdev, MDF_FULL_SYNC); - drbd_md_sync(mdev); - } - } - } - drbd_force_state(mdev, NS(disk, D_DISKLESS)); - return 0; + return 1; } void drbd_go_diskless(struct drbd_conf *mdev) { D_ASSERT(mdev->state.disk == D_FAILED); if (!test_and_set_bit(GO_DISKLESS, &mdev->flags)) - drbd_queue_work(&mdev->tconn->sender_work, &mdev->go_diskless); + drbd_queue_work(&mdev->data.work, &mdev->go_diskless); } /** @@ -3279,7 +4215,7 @@ void drbd_queue_bitmap_io(struct drbd_conf *mdev, void (*done)(struct drbd_conf *, int), char *why, enum bm_flag flags) { - D_ASSERT(current == mdev->tconn->worker.task); + D_ASSERT(current == mdev->worker.task); D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags)); D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags)); @@ -3293,13 +4229,13 @@ void drbd_queue_bitmap_io(struct drbd_conf *mdev, mdev->bm_io_work.why = why; mdev->bm_io_work.flags = flags; - spin_lock_irq(&mdev->tconn->req_lock); + spin_lock_irq(&mdev->req_lock); set_bit(BITMAP_IO, &mdev->flags); if (atomic_read(&mdev->ap_bio_cnt) == 0) { if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags)) - drbd_queue_work(&mdev->tconn->sender_work, &mdev->bm_io_work.w); + drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w); } - spin_unlock_irq(&mdev->tconn->req_lock); + spin_unlock_irq(&mdev->req_lock); } /** @@ -3316,7 +4252,7 @@ int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), { int rv; - D_ASSERT(current != mdev->tconn->worker.task); + D_ASSERT(current != mdev->worker.task); if ((flags & BM_LOCKED_SET_ALLOWED) == 0) drbd_suspend_io(mdev); @@ -3355,127 +4291,18 @@ static void md_sync_timer_fn(unsigned long data) { struct drbd_conf *mdev = (struct drbd_conf *) data; - /* must not double-queue! */ - if (list_empty(&mdev->md_sync_work.list)) - drbd_queue_work_front(&mdev->tconn->sender_work, &mdev->md_sync_work); + drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work); } -static int w_md_sync(struct drbd_work *w, int unused) +static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused) { - struct drbd_conf *mdev = w->mdev; - dev_warn(DEV, "md_sync_timer expired! 
Worker calls drbd_md_sync().\n"); #ifdef DEBUG dev_warn(DEV, "last md_mark_dirty: %s:%u\n", mdev->last_md_mark_dirty.func, mdev->last_md_mark_dirty.line); #endif drbd_md_sync(mdev); - return 0; -} - -const char *cmdname(enum drbd_packet cmd) -{ - /* THINK may need to become several global tables - * when we want to support more than - * one PRO_VERSION */ - static const char *cmdnames[] = { - [P_DATA] = "Data", - [P_DATA_REPLY] = "DataReply", - [P_RS_DATA_REPLY] = "RSDataReply", - [P_BARRIER] = "Barrier", - [P_BITMAP] = "ReportBitMap", - [P_BECOME_SYNC_TARGET] = "BecomeSyncTarget", - [P_BECOME_SYNC_SOURCE] = "BecomeSyncSource", - [P_UNPLUG_REMOTE] = "UnplugRemote", - [P_DATA_REQUEST] = "DataRequest", - [P_RS_DATA_REQUEST] = "RSDataRequest", - [P_SYNC_PARAM] = "SyncParam", - [P_SYNC_PARAM89] = "SyncParam89", - [P_PROTOCOL] = "ReportProtocol", - [P_UUIDS] = "ReportUUIDs", - [P_SIZES] = "ReportSizes", - [P_STATE] = "ReportState", - [P_SYNC_UUID] = "ReportSyncUUID", - [P_AUTH_CHALLENGE] = "AuthChallenge", - [P_AUTH_RESPONSE] = "AuthResponse", - [P_PING] = "Ping", - [P_PING_ACK] = "PingAck", - [P_RECV_ACK] = "RecvAck", - [P_WRITE_ACK] = "WriteAck", - [P_RS_WRITE_ACK] = "RSWriteAck", - [P_SUPERSEDED] = "Superseded", - [P_NEG_ACK] = "NegAck", - [P_NEG_DREPLY] = "NegDReply", - [P_NEG_RS_DREPLY] = "NegRSDReply", - [P_BARRIER_ACK] = "BarrierAck", - [P_STATE_CHG_REQ] = "StateChgRequest", - [P_STATE_CHG_REPLY] = "StateChgReply", - [P_OV_REQUEST] = "OVRequest", - [P_OV_REPLY] = "OVReply", - [P_OV_RESULT] = "OVResult", - [P_CSUM_RS_REQUEST] = "CsumRSRequest", - [P_RS_IS_IN_SYNC] = "CsumRSIsInSync", - [P_COMPRESSED_BITMAP] = "CBitmap", - [P_DELAY_PROBE] = "DelayProbe", - [P_OUT_OF_SYNC] = "OutOfSync", - [P_RETRY_WRITE] = "RetryWrite", - [P_RS_CANCEL] = "RSCancel", - [P_CONN_ST_CHG_REQ] = "conn_st_chg_req", - [P_CONN_ST_CHG_REPLY] = "conn_st_chg_reply", - [P_RETRY_WRITE] = "retry_write", - [P_PROTOCOL_UPDATE] = "protocol_update", - - /* enum drbd_packet, but not commands - obsoleted flags: - * P_MAY_IGNORE - * P_MAX_OPT_CMD - */ - }; - - /* too big for the array: 0xfffX */ - if (cmd == P_INITIAL_META) - return "InitialMeta"; - if (cmd == P_INITIAL_DATA) - return "InitialData"; - if (cmd == P_CONNECTION_FEATURES) - return "ConnectionFeatures"; - if (cmd >= ARRAY_SIZE(cmdnames)) - return "Unknown"; - return cmdnames[cmd]; -} - -/** - * drbd_wait_misc - wait for a request to make progress - * @mdev: device associated with the request - * @i: the struct drbd_interval embedded in struct drbd_request or - * struct drbd_peer_request - */ -int drbd_wait_misc(struct drbd_conf *mdev, struct drbd_interval *i) -{ - struct net_conf *nc; - DEFINE_WAIT(wait); - long timeout; - - rcu_read_lock(); - nc = rcu_dereference(mdev->tconn->net_conf); - if (!nc) { - rcu_read_unlock(); - return -ETIMEDOUT; - } - timeout = nc->ko_count ? nc->timeout * HZ / 10 * nc->ko_count : MAX_SCHEDULE_TIMEOUT; - rcu_read_unlock(); - - /* Indicate to wake up mdev->misc_wait on progress. 
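The w_bitmap_io, w_go_diskless and w_md_sync hunks above all revert the same interface change. A side-by-side sketch of the two worker-callback conventions, with invented callback names (struct drbd_conf and struct drbd_work are DRBD's own types):

/* 8.4-style callback being removed: the device is reached through the
 * work item, and 0 means success. */
static int example_cb_new(struct drbd_work *w, int cancel)
{
	struct drbd_conf *mdev = w->mdev;
	/* ... operate on mdev ... */
	return 0;
}

/* 8.3-style callback being restored: the device is an explicit first
 * argument, and 1 means success. */
static int example_cb_old(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	/* ... operate on mdev ... */
	return 1;
}
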
*/ - i->waiting = true; - prepare_to_wait(&mdev->misc_wait, &wait, TASK_INTERRUPTIBLE); - spin_unlock_irq(&mdev->tconn->req_lock); - timeout = schedule_timeout(timeout); - finish_wait(&mdev->misc_wait, &wait); - spin_lock_irq(&mdev->tconn->req_lock); - if (!timeout || mdev->state.conn < C_CONNECTED) - return -ETIMEDOUT; - if (signal_pending(current)) - return -ERESTARTSYS; - return 0; + return 1; } #ifdef CONFIG_DRBD_FAULT_INJECTION diff --git a/trunk/drivers/block/drbd/drbd_nl.c b/trunk/drivers/block/drbd/drbd_nl.c index 2af26fc95280..edb490aad8b4 100644 --- a/trunk/drivers/block/drbd/drbd_nl.c +++ b/trunk/drivers/block/drbd/drbd_nl.c @@ -29,317 +29,159 @@ #include #include #include +#include #include #include #include "drbd_int.h" #include "drbd_req.h" #include "drbd_wrappers.h" #include +#include #include +#include #include -#include - -/* .doit */ -// int drbd_adm_create_resource(struct sk_buff *skb, struct genl_info *info); -// int drbd_adm_delete_resource(struct sk_buff *skb, struct genl_info *info); - -int drbd_adm_add_minor(struct sk_buff *skb, struct genl_info *info); -int drbd_adm_delete_minor(struct sk_buff *skb, struct genl_info *info); - -int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info); -int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info); -int drbd_adm_down(struct sk_buff *skb, struct genl_info *info); - -int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info); -int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info); -int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info); -int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info); -int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info); -int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info); -int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info); -int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info); -int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info); -int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info); -int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info); -int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info); -int drbd_adm_pause_sync(struct sk_buff *skb, struct genl_info *info); -int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info); -int drbd_adm_suspend_io(struct sk_buff *skb, struct genl_info *info); -int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info); -int drbd_adm_outdate(struct sk_buff *skb, struct genl_info *info); -int drbd_adm_resource_opts(struct sk_buff *skb, struct genl_info *info); -int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info); -int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info); -/* .dumpit */ -int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb); - -#include -#include "drbd_nla.h" -#include - -/* used blkdev_get_by_path, to claim our meta data device(s) */ -static char *drbd_m_holder = "Hands off! this is DRBD's meta data device."; - -/* Configuration is strictly serialized, because generic netlink message - * processing is strictly serialized by the genl_lock(). - * Which means we can use one static global drbd_config_context struct. - */ -static struct drbd_config_context { - /* assigned from drbd_genlmsghdr */ - unsigned int minor; - /* assigned from request attributes, if present */ - unsigned int volume; -#define VOLUME_UNSPECIFIED (-1U) - /* pointer into the request skb, - * limited lifetime! 
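For context on the drbd_wait_misc() removal above: it was a plain instance of the prepare_to_wait()/schedule_timeout()/finish_wait() sleep pattern. A generic, hedged sketch; everything except the wait-queue API is invented:

#include <linux/wait.h>
#include <linux/sched.h>

/* Sleep until *flag becomes true or the timeout expires.
 * Returns the remaining timeout, i.e. 0 if we timed out. */
static long wait_for_flag(wait_queue_head_t *wq, bool *flag, long timeout)
{
	DEFINE_WAIT(wait);

	prepare_to_wait(wq, &wait, TASK_INTERRUPTIBLE);
	if (!*flag)	/* re-check after queueing to avoid a lost wake-up */
		timeout = schedule_timeout(timeout);
	finish_wait(wq, &wait);
	return timeout;
}
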
*/ - char *resource_name; - struct nlattr *my_addr; - struct nlattr *peer_addr; - - /* reply buffer */ - struct sk_buff *reply_skb; - /* pointer into reply buffer */ - struct drbd_genlmsghdr *reply_dh; - /* resolved from attributes, if possible */ - struct drbd_conf *mdev; - struct drbd_tconn *tconn; -} adm_ctx; - -static void drbd_adm_send_reply(struct sk_buff *skb, struct genl_info *info) -{ - genlmsg_end(skb, genlmsg_data(nlmsg_data(nlmsg_hdr(skb)))); - if (genlmsg_reply(skb, info)) - printk(KERN_ERR "drbd: error sending genl reply\n"); -} - -/* Used on a fresh "drbd_adm_prepare"d reply_skb, this cannot fail: The only - * reason it could fail was no space in skb, and there are 4k available. */ -int drbd_msg_put_info(const char *info) -{ - struct sk_buff *skb = adm_ctx.reply_skb; - struct nlattr *nla; - int err = -EMSGSIZE; - - if (!info || !info[0]) - return 0; - - nla = nla_nest_start(skb, DRBD_NLA_CFG_REPLY); - if (!nla) - return err; - - err = nla_put_string(skb, T_info_text, info); - if (err) { - nla_nest_cancel(skb, nla); - return err; - } else - nla_nest_end(skb, nla); - return 0; -} - -/* This would be a good candidate for a "pre_doit" hook, - * and per-family private info->pointers. - * But we need to stay compatible with older kernels. - * If it returns successfully, adm_ctx members are valid. - */ -#define DRBD_ADM_NEED_MINOR 1 -#define DRBD_ADM_NEED_RESOURCE 2 -#define DRBD_ADM_NEED_CONNECTION 4 -static int drbd_adm_prepare(struct sk_buff *skb, struct genl_info *info, - unsigned flags) -{ - struct drbd_genlmsghdr *d_in = info->userhdr; - const u8 cmd = info->genlhdr->cmd; - int err; - - memset(&adm_ctx, 0, sizeof(adm_ctx)); - - /* genl_rcv_msg only checks for CAP_NET_ADMIN on "GENL_ADMIN_PERM" :( */ - if (cmd != DRBD_ADM_GET_STATUS && !capable(CAP_NET_ADMIN)) - return -EPERM; - - adm_ctx.reply_skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); - if (!adm_ctx.reply_skb) { - err = -ENOMEM; - goto fail; - } - - adm_ctx.reply_dh = genlmsg_put_reply(adm_ctx.reply_skb, - info, &drbd_genl_family, 0, cmd); - /* put of a few bytes into a fresh skb of >= 4k will always succeed. - * but anyways */ - if (!adm_ctx.reply_dh) { - err = -ENOMEM; - goto fail; - } - - adm_ctx.reply_dh->minor = d_in->minor; - adm_ctx.reply_dh->ret_code = NO_ERROR; - - adm_ctx.volume = VOLUME_UNSPECIFIED; - if (info->attrs[DRBD_NLA_CFG_CONTEXT]) { - struct nlattr *nla; - /* parse and validate only */ - err = drbd_cfg_context_from_attrs(NULL, info); - if (err) - goto fail; - - /* It was present, and valid, - * copy it over to the reply skb. 
*/ - err = nla_put_nohdr(adm_ctx.reply_skb, - info->attrs[DRBD_NLA_CFG_CONTEXT]->nla_len, - info->attrs[DRBD_NLA_CFG_CONTEXT]); - if (err) - goto fail; - - /* and assign stuff to the global adm_ctx */ - nla = nested_attr_tb[__nla_type(T_ctx_volume)]; - if (nla) - adm_ctx.volume = nla_get_u32(nla); - nla = nested_attr_tb[__nla_type(T_ctx_resource_name)]; - if (nla) - adm_ctx.resource_name = nla_data(nla); - adm_ctx.my_addr = nested_attr_tb[__nla_type(T_ctx_my_addr)]; - adm_ctx.peer_addr = nested_attr_tb[__nla_type(T_ctx_peer_addr)]; - if ((adm_ctx.my_addr && - nla_len(adm_ctx.my_addr) > sizeof(adm_ctx.tconn->my_addr)) || - (adm_ctx.peer_addr && - nla_len(adm_ctx.peer_addr) > sizeof(adm_ctx.tconn->peer_addr))) { - err = -EINVAL; - goto fail; - } - } - - adm_ctx.minor = d_in->minor; - adm_ctx.mdev = minor_to_mdev(d_in->minor); - adm_ctx.tconn = conn_get_by_name(adm_ctx.resource_name); - - if (!adm_ctx.mdev && (flags & DRBD_ADM_NEED_MINOR)) { - drbd_msg_put_info("unknown minor"); - return ERR_MINOR_INVALID; - } - if (!adm_ctx.tconn && (flags & DRBD_ADM_NEED_RESOURCE)) { - drbd_msg_put_info("unknown resource"); - return ERR_INVALID_REQUEST; - } - - if (flags & DRBD_ADM_NEED_CONNECTION) { - if (adm_ctx.tconn && !(flags & DRBD_ADM_NEED_RESOURCE)) { - drbd_msg_put_info("no resource name expected"); - return ERR_INVALID_REQUEST; - } - if (adm_ctx.mdev) { - drbd_msg_put_info("no minor number expected"); - return ERR_INVALID_REQUEST; - } - if (adm_ctx.my_addr && adm_ctx.peer_addr) - adm_ctx.tconn = conn_get_by_addrs(nla_data(adm_ctx.my_addr), - nla_len(adm_ctx.my_addr), - nla_data(adm_ctx.peer_addr), - nla_len(adm_ctx.peer_addr)); - if (!adm_ctx.tconn) { - drbd_msg_put_info("unknown connection"); - return ERR_INVALID_REQUEST; - } - } - - /* some more paranoia, if the request was over-determined */ - if (adm_ctx.mdev && adm_ctx.tconn && - adm_ctx.mdev->tconn != adm_ctx.tconn) { - pr_warning("request: minor=%u, resource=%s; but that minor belongs to connection %s\n", - adm_ctx.minor, adm_ctx.resource_name, - adm_ctx.mdev->tconn->name); - drbd_msg_put_info("minor exists in different resource"); - return ERR_INVALID_REQUEST; - } - if (adm_ctx.mdev && - adm_ctx.volume != VOLUME_UNSPECIFIED && - adm_ctx.volume != adm_ctx.mdev->vnr) { - pr_warning("request: minor=%u, volume=%u; but that minor is volume %u in %s\n", - adm_ctx.minor, adm_ctx.volume, - adm_ctx.mdev->vnr, adm_ctx.mdev->tconn->name); - drbd_msg_put_info("minor exists as different volume"); - return ERR_INVALID_REQUEST; - } - - return NO_ERROR; - -fail: - nlmsg_free(adm_ctx.reply_skb); - adm_ctx.reply_skb = NULL; - return err; -} - -static int drbd_adm_finish(struct genl_info *info, int retcode) -{ - if (adm_ctx.tconn) { - kref_put(&adm_ctx.tconn->kref, &conn_destroy); - adm_ctx.tconn = NULL; - } - - if (!adm_ctx.reply_skb) - return -ENOMEM; - - adm_ctx.reply_dh->ret_code = retcode; - drbd_adm_send_reply(adm_ctx.reply_skb, info); - return 0; -} +static unsigned short *tl_add_blob(unsigned short *, enum drbd_tags, const void *, int); +static unsigned short *tl_add_str(unsigned short *, enum drbd_tags, const char *); +static unsigned short *tl_add_int(unsigned short *, enum drbd_tags, const void *); -static void setup_khelper_env(struct drbd_tconn *tconn, char **envp) -{ - char *afs; - - /* FIXME: A future version will not allow this case. */ - if (tconn->my_addr_len == 0 || tconn->peer_addr_len == 0) - return; +/* see get_sb_bdev and bd_claim */ +static char *drbd_m_holder = "Hands off! 
this is DRBD's meta data device."; - switch (((struct sockaddr *)&tconn->peer_addr)->sa_family) { - case AF_INET6: - afs = "ipv6"; - snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI6", - &((struct sockaddr_in6 *)&tconn->peer_addr)->sin6_addr); +/* Generate the tag_list to struct functions */ +#define NL_PACKET(name, number, fields) \ +static int name ## _from_tags(struct drbd_conf *mdev, \ + unsigned short *tags, struct name *arg) __attribute__ ((unused)); \ +static int name ## _from_tags(struct drbd_conf *mdev, \ + unsigned short *tags, struct name *arg) \ +{ \ + int tag; \ + int dlen; \ + \ + while ((tag = get_unaligned(tags++)) != TT_END) { \ + dlen = get_unaligned(tags++); \ + switch (tag_number(tag)) { \ + fields \ + default: \ + if (tag & T_MANDATORY) { \ + dev_err(DEV, "Unknown tag: %d\n", tag_number(tag)); \ + return 0; \ + } \ + } \ + tags = (unsigned short *)((char *)tags + dlen); \ + } \ + return 1; \ +} +#define NL_INTEGER(pn, pr, member) \ + case pn: /* D_ASSERT( tag_type(tag) == TT_INTEGER ); */ \ + arg->member = get_unaligned((int *)(tags)); \ break; - case AF_INET: - afs = "ipv4"; - snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI4", - &((struct sockaddr_in *)&tconn->peer_addr)->sin_addr); +#define NL_INT64(pn, pr, member) \ + case pn: /* D_ASSERT( tag_type(tag) == TT_INT64 ); */ \ + arg->member = get_unaligned((u64 *)(tags)); \ break; - default: - afs = "ssocks"; - snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI4", - &((struct sockaddr_in *)&tconn->peer_addr)->sin_addr); - } - snprintf(envp[3], 20, "DRBD_PEER_AF=%s", afs); -} +#define NL_BIT(pn, pr, member) \ + case pn: /* D_ASSERT( tag_type(tag) == TT_BIT ); */ \ + arg->member = *(char *)(tags) ? 1 : 0; \ + break; +#define NL_STRING(pn, pr, member, len) \ + case pn: /* D_ASSERT( tag_type(tag) == TT_STRING ); */ \ + if (dlen > len) { \ + dev_err(DEV, "arg too long: %s (%u wanted, max len: %u bytes)\n", \ + #member, dlen, (unsigned int)len); \ + return 0; \ + } \ + arg->member ## _len = dlen; \ + memcpy(arg->member, tags, min_t(size_t, dlen, len)); \ + break; +#include + +/* Generate the struct to tag_list functions */ +#define NL_PACKET(name, number, fields) \ +static unsigned short* \ +name ## _to_tags(struct drbd_conf *mdev, \ + struct name *arg, unsigned short *tags) __attribute__ ((unused)); \ +static unsigned short* \ +name ## _to_tags(struct drbd_conf *mdev, \ + struct name *arg, unsigned short *tags) \ +{ \ + fields \ + return tags; \ +} + +#define NL_INTEGER(pn, pr, member) \ + put_unaligned(pn | pr | TT_INTEGER, tags++); \ + put_unaligned(sizeof(int), tags++); \ + put_unaligned(arg->member, (int *)tags); \ + tags = (unsigned short *)((char *)tags+sizeof(int)); +#define NL_INT64(pn, pr, member) \ + put_unaligned(pn | pr | TT_INT64, tags++); \ + put_unaligned(sizeof(u64), tags++); \ + put_unaligned(arg->member, (u64 *)tags); \ + tags = (unsigned short *)((char *)tags+sizeof(u64)); +#define NL_BIT(pn, pr, member) \ + put_unaligned(pn | pr | TT_BIT, tags++); \ + put_unaligned(sizeof(char), tags++); \ + *(char *)tags = arg->member; \ + tags = (unsigned short *)((char *)tags+sizeof(char)); +#define NL_STRING(pn, pr, member, len) \ + put_unaligned(pn | pr | TT_STRING, tags++); \ + put_unaligned(arg->member ## _len, tags++); \ + memcpy(tags, arg->member, arg->member ## _len); \ + tags = (unsigned short *)((char *)tags + arg->member ## _len); +#include + +void drbd_bcast_ev_helper(struct drbd_conf *mdev, char *helper_name); +void drbd_nl_send_reply(struct cn_msg *, int); int drbd_khelper(struct drbd_conf *mdev, char *cmd) { char 
*envp[] = { "HOME=/", "TERM=linux", "PATH=/sbin:/usr/sbin:/bin:/usr/bin", - (char[20]) { }, /* address family */ - (char[60]) { }, /* address */ + NULL, /* Will be set to address family */ + NULL, /* Will be set to address */ NULL }; - char mb[12]; + + char mb[12], af[20], ad[60], *afs; char *argv[] = {usermode_helper, cmd, mb, NULL }; - struct drbd_tconn *tconn = mdev->tconn; - struct sib_info sib; int ret; - if (current == tconn->worker.task) - set_bit(CALLBACK_PENDING, &tconn->flags); + if (current == mdev->worker.task) + set_bit(CALLBACK_PENDING, &mdev->flags); snprintf(mb, 12, "minor-%d", mdev_to_minor(mdev)); - setup_khelper_env(tconn, envp); + + if (get_net_conf(mdev)) { + switch (((struct sockaddr *)mdev->net_conf->peer_addr)->sa_family) { + case AF_INET6: + afs = "ipv6"; + snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI6", + &((struct sockaddr_in6 *)mdev->net_conf->peer_addr)->sin6_addr); + break; + case AF_INET: + afs = "ipv4"; + snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI4", + &((struct sockaddr_in *)mdev->net_conf->peer_addr)->sin_addr); + break; + default: + afs = "ssocks"; + snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI4", + &((struct sockaddr_in *)mdev->net_conf->peer_addr)->sin_addr); + } + snprintf(af, 20, "DRBD_PEER_AF=%s", afs); + envp[3]=af; + envp[4]=ad; + put_net_conf(mdev); + } /* The helper may take some time. * write out any unsynced meta data changes now */ drbd_md_sync(mdev); dev_info(DEV, "helper command: %s %s %s\n", usermode_helper, cmd, mb); - sib.sib_reason = SIB_HELPER_PRE; - sib.helper_name = cmd; - drbd_bcast_event(mdev, &sib); + + drbd_bcast_ev_helper(mdev, cmd); ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC); if (ret) dev_warn(DEV, "helper command: %s %s %s exit code %u (0x%x)\n", @@ -349,46 +191,9 @@ int drbd_khelper(struct drbd_conf *mdev, char *cmd) dev_info(DEV, "helper command: %s %s %s exit code %u (0x%x)\n", usermode_helper, cmd, mb, (ret >> 8) & 0xff, ret); - sib.sib_reason = SIB_HELPER_POST; - sib.helper_exit_code = ret; - drbd_bcast_event(mdev, &sib); - - if (current == tconn->worker.task) - clear_bit(CALLBACK_PENDING, &tconn->flags); - if (ret < 0) /* Ignore any ERRNOs we got. */ - ret = 0; - - return ret; -} - -int conn_khelper(struct drbd_tconn *tconn, char *cmd) -{ - char *envp[] = { "HOME=/", - "TERM=linux", - "PATH=/sbin:/usr/sbin:/bin:/usr/bin", - (char[20]) { }, /* address family */ - (char[60]) { }, /* address */ - NULL }; - char *argv[] = {usermode_helper, cmd, tconn->name, NULL }; - int ret; - - setup_khelper_env(tconn, envp); - conn_md_sync(tconn); - - conn_info(tconn, "helper command: %s %s %s\n", usermode_helper, cmd, tconn->name); - /* TODO: conn_bcast_event() ?? */ - - ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC); - if (ret) - conn_warn(tconn, "helper command: %s %s %s exit code %u (0x%x)\n", - usermode_helper, cmd, tconn->name, - (ret >> 8) & 0xff, ret); - else - conn_info(tconn, "helper command: %s %s %s exit code %u (0x%x)\n", - usermode_helper, cmd, tconn->name, - (ret >> 8) & 0xff, ret); - /* TODO: conn_bcast_event() ?? */ + if (current == mdev->worker.task) + clear_bit(CALLBACK_PENDING, &mdev->flags); if (ret < 0) /* Ignore any ERRNOs we got. 
*/ ret = 0; @@ -396,129 +201,116 @@ int conn_khelper(struct drbd_tconn *tconn, char *cmd) return ret; } -static enum drbd_fencing_p highest_fencing_policy(struct drbd_tconn *tconn) -{ - enum drbd_fencing_p fp = FP_NOT_AVAIL; - struct drbd_conf *mdev; - int vnr; - - rcu_read_lock(); - idr_for_each_entry(&tconn->volumes, mdev, vnr) { - if (get_ldev_if_state(mdev, D_CONSISTENT)) { - fp = max_t(enum drbd_fencing_p, fp, - rcu_dereference(mdev->ldev->disk_conf)->fencing); - put_ldev(mdev); - } - } - rcu_read_unlock(); - - return fp; -} - -bool conn_try_outdate_peer(struct drbd_tconn *tconn) +enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev) { - union drbd_state mask = { }; - union drbd_state val = { }; - enum drbd_fencing_p fp; char *ex_to_string; int r; + enum drbd_disk_state nps; + enum drbd_fencing_p fp; - if (tconn->cstate >= C_WF_REPORT_PARAMS) { - conn_err(tconn, "Expected cstate < C_WF_REPORT_PARAMS\n"); - return false; - } + D_ASSERT(mdev->state.pdsk == D_UNKNOWN); - fp = highest_fencing_policy(tconn); - switch (fp) { - case FP_NOT_AVAIL: - conn_warn(tconn, "Not fencing peer, I'm not even Consistent myself.\n"); + if (get_ldev_if_state(mdev, D_CONSISTENT)) { + fp = mdev->ldev->dc.fencing; + put_ldev(mdev); + } else { + dev_warn(DEV, "Not fencing peer, I'm not even Consistent myself.\n"); + nps = mdev->state.pdsk; goto out; - case FP_DONT_CARE: - return true; - default: ; } - r = conn_khelper(tconn, "fence-peer"); + r = drbd_khelper(mdev, "fence-peer"); switch ((r>>8) & 0xff) { case 3: /* peer is inconsistent */ ex_to_string = "peer is inconsistent or worse"; - mask.pdsk = D_MASK; - val.pdsk = D_INCONSISTENT; + nps = D_INCONSISTENT; break; case 4: /* peer got outdated, or was already outdated */ ex_to_string = "peer was fenced"; - mask.pdsk = D_MASK; - val.pdsk = D_OUTDATED; + nps = D_OUTDATED; break; case 5: /* peer was down */ - if (conn_highest_disk(tconn) == D_UP_TO_DATE) { + if (mdev->state.disk == D_UP_TO_DATE) { /* we will(have) create(d) a new UUID anyways... */ ex_to_string = "peer is unreachable, assumed to be dead"; - mask.pdsk = D_MASK; - val.pdsk = D_OUTDATED; + nps = D_OUTDATED; } else { ex_to_string = "peer unreachable, doing nothing since disk != UpToDate"; + nps = mdev->state.pdsk; } break; case 6: /* Peer is primary, voluntarily outdate myself. * This is useful when an unconnected R_SECONDARY is asked to * become R_PRIMARY, but finds the other peer being active. */ ex_to_string = "peer is active"; - conn_warn(tconn, "Peer is primary, outdating myself.\n"); - mask.disk = D_MASK; - val.disk = D_OUTDATED; + dev_warn(DEV, "Peer is primary, outdating myself.\n"); + nps = D_UNKNOWN; + _drbd_request_state(mdev, NS(disk, D_OUTDATED), CS_WAIT_COMPLETE); break; case 7: if (fp != FP_STONITH) - conn_err(tconn, "fence-peer() = 7 && fencing != Stonith !!!\n"); + dev_err(DEV, "fence-peer() = 7 && fencing != Stonith !!!\n"); ex_to_string = "peer was stonithed"; - mask.pdsk = D_MASK; - val.pdsk = D_OUTDATED; + nps = D_OUTDATED; break; default: /* The script is broken ... 
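A short aside on the (r >> 8) & 0xff pattern in the switch above, since it recurs throughout this file; it is ordinary wait()-status decoding rather than anything DRBD-specific:

/* call_usermodehelper(..., UMH_WAIT_PROC) returns a wait()-style status
 * word, so the helper's exit code sits in bits 8..15:
 *
 *	helper does exit(5)  ->  r == 0x0500  ->  (r >> 8) & 0xff == 5
 */
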
*/ - conn_err(tconn, "fence-peer helper broken, returned %d\n", (r>>8)&0xff); - return false; /* Eventually leave IO frozen */ + nps = D_UNKNOWN; + dev_err(DEV, "fence-peer helper broken, returned %d\n", (r>>8)&0xff); + return nps; } - conn_info(tconn, "fence-peer helper returned %d (%s)\n", - (r>>8) & 0xff, ex_to_string); + dev_info(DEV, "fence-peer helper returned %d (%s)\n", + (r>>8) & 0xff, ex_to_string); - out: +out: + if (mdev->state.susp_fen && nps >= D_UNKNOWN) { + /* The handler was not successful... unfreeze here, the + state engine can not unfreeze... */ + _drbd_request_state(mdev, NS(susp_fen, 0), CS_VERBOSE); + } - /* Not using - conn_request_state(tconn, mask, val, CS_VERBOSE); - here, because we might were able to re-establish the connection in the - meantime. */ - spin_lock_irq(&tconn->req_lock); - if (tconn->cstate < C_WF_REPORT_PARAMS && !test_bit(STATE_SENT, &tconn->flags)) - _conn_request_state(tconn, mask, val, CS_VERBOSE); - spin_unlock_irq(&tconn->req_lock); - - return conn_highest_pdsk(tconn) <= D_OUTDATED; + return nps; } static int _try_outdate_peer_async(void *data) { - struct drbd_tconn *tconn = (struct drbd_tconn *)data; + struct drbd_conf *mdev = (struct drbd_conf *)data; + enum drbd_disk_state nps; + union drbd_state ns; - conn_try_outdate_peer(tconn); + nps = drbd_try_outdate_peer(mdev); + + /* Not using + drbd_request_state(mdev, NS(pdsk, nps)); + here, because we might were able to re-establish the connection + in the meantime. This can only partially be solved in the state's + engine is_valid_state() and is_valid_state_transition() + functions. + + nps can be D_INCONSISTENT, D_OUTDATED or D_UNKNOWN. + pdsk == D_INCONSISTENT while conn >= C_CONNECTED is valid, + therefore we have to have the pre state change check here. 
+ */ + spin_lock_irq(&mdev->req_lock); + ns = mdev->state; + if (ns.conn < C_WF_REPORT_PARAMS && !test_bit(STATE_SENT, &mdev->flags)) { + ns.pdsk = nps; + _drbd_set_state(mdev, ns, CS_VERBOSE, NULL); + } + spin_unlock_irq(&mdev->req_lock); - kref_put(&tconn->kref, &conn_destroy); return 0; } -void conn_try_outdate_peer_async(struct drbd_tconn *tconn) +void drbd_try_outdate_peer_async(struct drbd_conf *mdev) { struct task_struct *opa; - kref_get(&tconn->kref); - opa = kthread_run(_try_outdate_peer_async, tconn, "drbd_async_h"); - if (IS_ERR(opa)) { - conn_err(tconn, "out of mem, failed to invoke fence-peer helper\n"); - kref_put(&tconn->kref, &conn_destroy); - } + opa = kthread_run(_try_outdate_peer_async, mdev, "drbd%d_a_helper", mdev_to_minor(mdev)); + if (IS_ERR(opa)) + dev_err(DEV, "out of mem, failed to invoke fence-peer helper\n"); } enum drbd_state_rv @@ -526,15 +318,15 @@ drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force) { const int max_tries = 4; enum drbd_state_rv rv = SS_UNKNOWN_ERROR; - struct net_conf *nc; int try = 0; int forced = 0; union drbd_state mask, val; + enum drbd_disk_state nps; if (new_role == R_PRIMARY) - request_ping(mdev->tconn); /* Detect a dead peer ASAP */ + request_ping(mdev); /* Detect a dead peer ASAP */ - mutex_lock(mdev->state_mutex); + mutex_lock(&mdev->state_mutex); mask.i = 0; mask.role = R_MASK; val.i = 0; val.role = new_role; @@ -562,34 +354,38 @@ drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force) if (rv == SS_NO_UP_TO_DATE_DISK && mdev->state.disk == D_CONSISTENT && mask.pdsk == 0) { D_ASSERT(mdev->state.pdsk == D_UNKNOWN); + nps = drbd_try_outdate_peer(mdev); - if (conn_try_outdate_peer(mdev->tconn)) { + if (nps == D_OUTDATED || nps == D_INCONSISTENT) { val.disk = D_UP_TO_DATE; mask.disk = D_MASK; } + + val.pdsk = nps; + mask.pdsk = D_MASK; + continue; } if (rv == SS_NOTHING_TO_DO) - goto out; + goto fail; if (rv == SS_PRIMARY_NOP && mask.pdsk == 0) { - if (!conn_try_outdate_peer(mdev->tconn) && force) { - dev_warn(DEV, "Forced into split brain situation!\n"); - mask.pdsk = D_MASK; - val.pdsk = D_OUTDATED; + nps = drbd_try_outdate_peer(mdev); + if (force && nps > D_OUTDATED) { + dev_warn(DEV, "Forced into split brain situation!\n"); + nps = D_OUTDATED; } + + mask.pdsk = D_MASK; + val.pdsk = nps; + continue; } if (rv == SS_TWO_PRIMARIES) { /* Maybe the peer is detected as dead very soon... retry at most once more in this case. */ - int timeo; - rcu_read_lock(); - nc = rcu_dereference(mdev->tconn->net_conf); - timeo = nc ? (nc->ping_timeo + 1) * HZ / 10 : 1; - rcu_read_unlock(); - schedule_timeout_interruptible(timeo); + schedule_timeout_interruptible((mdev->net_conf->ping_timeo+1)*HZ/10); if (try < max_tries) try = max_tries - 1; continue; @@ -598,13 +394,13 @@ drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force) rv = _drbd_request_state(mdev, mask, val, CS_VERBOSE + CS_WAIT_COMPLETE); if (rv < SS_SUCCESS) - goto out; + goto fail; } break; } if (rv < SS_SUCCESS) - goto out; + goto fail; if (forced) dev_warn(DEV, "Forced to consider local data as UpToDate!\n"); @@ -612,8 +408,6 @@ drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force) /* Wait until nothing is on the fly :) */ wait_event(mdev->misc_wait, atomic_read(&mdev->ap_pending_cnt) == 0); - /* FIXME also wait for all pending P_BARRIER_ACK? 
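A worked example for the retry sleep in the SS_TWO_PRIMARIES branch above, assuming (as in DRBD's configuration) that ping_timeo is given in tenths of a second:

/* (ping_timeo + 1) * HZ / 10 converts tenths of a second into jiffies,
 * with one extra tenth of slack.  E.g. ping_timeo = 5 and HZ = 1000:
 *
 *	(5 + 1) * 1000 / 10 = 600 jiffies  ~=  0.6 s
 */
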
*/ - if (new_role == R_SECONDARY) { set_disk_ro(mdev->vdisk, true); if (get_ldev(mdev)) { @@ -621,12 +415,10 @@ drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force) put_ldev(mdev); } } else { - mutex_lock(&mdev->tconn->conf_update); - nc = mdev->tconn->net_conf; - if (nc) - nc->discard_my_data = 0; /* without copy; single bit op is atomic */ - mutex_unlock(&mdev->tconn->conf_update); - + if (get_net_conf(mdev)) { + mdev->net_conf->want_lose = 0; + put_net_conf(mdev); + } set_disk_ro(mdev->vdisk, false); if (get_ldev(mdev)) { if (((mdev->state.conn < C_CONNECTED || @@ -652,47 +444,67 @@ drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force) drbd_md_sync(mdev); kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE); -out: - mutex_unlock(mdev->state_mutex); + fail: + mutex_unlock(&mdev->state_mutex); return rv; } -static const char *from_attrs_err_to_txt(int err) +static struct drbd_conf *ensure_mdev(int minor, int create) { - return err == -ENOMSG ? "required attribute missing" : - err == -EOPNOTSUPP ? "unknown mandatory attribute" : - err == -EEXIST ? "can not change invariant setting" : - "invalid attribute value"; + struct drbd_conf *mdev; + + if (minor >= minor_count) + return NULL; + + mdev = minor_to_mdev(minor); + + if (!mdev && create) { + struct gendisk *disk = NULL; + mdev = drbd_new_device(minor); + + spin_lock_irq(&drbd_pp_lock); + if (minor_table[minor] == NULL) { + minor_table[minor] = mdev; + disk = mdev->vdisk; + mdev = NULL; + } /* else: we lost the race */ + spin_unlock_irq(&drbd_pp_lock); + + if (disk) /* we won the race above */ + /* in case we ever add a drbd_delete_device(), + * don't forget the del_gendisk! */ + add_disk(disk); + else /* we lost the race above */ + drbd_free_mdev(mdev); + + mdev = minor_to_mdev(minor); + } + + return mdev; } -int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info) +static int drbd_nl_primary(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) { - struct set_role_parms parms; - int err; - enum drbd_ret_code retcode; - - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); - if (!adm_ctx.reply_skb) - return retcode; - if (retcode != NO_ERROR) - goto out; + struct primary primary_args; - memset(&parms, 0, sizeof(parms)); - if (info->attrs[DRBD_NLA_SET_ROLE_PARMS]) { - err = set_role_parms_from_attrs(&parms, info); - if (err) { - retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(from_attrs_err_to_txt(err)); - goto out; - } + memset(&primary_args, 0, sizeof(struct primary)); + if (!primary_from_tags(mdev, nlp->tag_list, &primary_args)) { + reply->ret_code = ERR_MANDATORY_TAG; + return 0; } - if (info->genlhdr->cmd == DRBD_ADM_PRIMARY) - retcode = drbd_set_role(adm_ctx.mdev, R_PRIMARY, parms.assume_uptodate); - else - retcode = drbd_set_role(adm_ctx.mdev, R_SECONDARY, 0); -out: - drbd_adm_finish(info, retcode); + reply->ret_code = + drbd_set_role(mdev, R_PRIMARY, primary_args.primary_force); + + return 0; +} + +static int drbd_nl_secondary(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + reply->ret_code = drbd_set_role(mdev, R_SECONDARY, 0); + return 0; } @@ -702,12 +514,7 @@ static void drbd_md_set_sector_offsets(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) { sector_t md_size_sect = 0; - int meta_dev_idx; - - rcu_read_lock(); - meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx; - - switch (meta_dev_idx) { + switch (bdev->dc.meta_dev_idx) { default: /* v07 style fixed size 
indexed meta data */ bdev->md.md_size_sect = MD_RESERVED_SECT; @@ -726,7 +533,7 @@ static void drbd_md_set_sector_offsets(struct drbd_conf *mdev, case DRBD_MD_INDEX_FLEX_INT: bdev->md.md_offset = drbd_md_ss__(mdev, bdev); /* al size is still fixed */ - bdev->md.al_offset = -MD_AL_SECTORS; + bdev->md.al_offset = -MD_AL_MAX_SIZE; /* we need (slightly less than) ~ this much bitmap sectors: */ md_size_sect = drbd_get_capacity(bdev->backing_bdev); md_size_sect = ALIGN(md_size_sect, BM_SECT_PER_EXT); @@ -742,7 +549,6 @@ static void drbd_md_set_sector_offsets(struct drbd_conf *mdev, bdev->md.bm_offset = -md_size_sect + MD_AL_OFFSET; break; } - rcu_read_unlock(); } /* input size is expected to be in KB */ @@ -775,16 +581,10 @@ char *ppsize(char *buf, unsigned long long size) * R_PRIMARY D_INCONSISTENT, and C_SYNC_TARGET: * peer may not initiate a resize. */ -/* Note these are not to be confused with - * drbd_adm_suspend_io/drbd_adm_resume_io, - * which are (sub) state changes triggered by admin (drbdsetup), - * and can be long lived. - * This changes an mdev->flag, is triggered by drbd internals, - * and should be short-lived. */ void drbd_suspend_io(struct drbd_conf *mdev) { set_bit(SUSPEND_IO, &mdev->flags); - if (drbd_suspended(mdev)) + if (is_susp(mdev->state)) return; wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt)); } @@ -805,7 +605,7 @@ void drbd_resume_io(struct drbd_conf *mdev) enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds_flags flags) __must_hold(local) { sector_t prev_first_sect, prev_size; /* previous meta location */ - sector_t la_size, u_size; + sector_t la_size; sector_t size; char ppb[10]; @@ -833,10 +633,7 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds /* TODO: should only be some assert here, not (re)init... */ drbd_md_set_sector_offsets(mdev, mdev->ldev); - rcu_read_lock(); - u_size = rcu_dereference(mdev->ldev->disk_conf)->disk_size; - rcu_read_unlock(); - size = drbd_new_dev_size(mdev, mdev->ldev, u_size, flags & DDSF_FORCED); + size = drbd_new_dev_size(mdev, mdev->ldev, flags & DDSF_FORCED); if (drbd_get_capacity(mdev->this_bdev) != size || drbd_bm_capacity(mdev) != size) { @@ -899,12 +696,12 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds } sector_t -drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, - sector_t u_size, int assume_peer_has_space) +drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, int assume_peer_has_space) { sector_t p_size = mdev->p_size; /* partner's disk size. */ sector_t la_size = bdev->md.la_size_sect; /* last agreed size. */ sector_t m_size; /* my size */ + sector_t u_size = bdev->dc.disk_size; /* size requested by user. */ sector_t size = 0; m_size = drbd_get_max_capacity(bdev); @@ -953,21 +750,24 @@ drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, * failed, and 0 on success. You should call drbd_md_sync() after you called * this function. 
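A hedged restatement of the core sizing rule in the reverted drbd_new_dev_size() above, with the last-agreed-size and not-connected handling stripped out; the helper and variable names are illustrative:

/* Usable size = min(local capacity, peer capacity if known), optionally
 * shrunk -- never grown -- by a user-requested size. */
static sector_t effective_size(sector_t my_size, sector_t peer_size,
			       sector_t user_size)
{
	sector_t size = peer_size ? min(my_size, peer_size) : my_size;

	if (user_size && user_size < size)
		size = user_size;
	return size;
}
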
*/ -static int drbd_check_al_size(struct drbd_conf *mdev, struct disk_conf *dc) +static int drbd_check_al_size(struct drbd_conf *mdev) { struct lru_cache *n, *t; struct lc_element *e; unsigned int in_use; int i; + ERR_IF(mdev->sync_conf.al_extents < 7) + mdev->sync_conf.al_extents = 127; + if (mdev->act_log && - mdev->act_log->nr_elements == dc->al_extents) + mdev->act_log->nr_elements == mdev->sync_conf.al_extents) return 0; in_use = 0; t = mdev->act_log; - n = lc_create("act_log", drbd_al_ext_cache, AL_UPDATES_PER_TRANSACTION, - dc->al_extents, sizeof(struct lc_element), 0); + n = lc_create("act_log", drbd_al_ext_cache, + mdev->sync_conf.al_extents, sizeof(struct lc_element), 0); if (n == NULL) { dev_err(DEV, "Cannot allocate act_log lru!\n"); @@ -1008,9 +808,7 @@ static void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_bio_ struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue; max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9); - rcu_read_lock(); - max_segments = rcu_dereference(mdev->ldev->disk_conf)->max_bio_bvecs; - rcu_read_unlock(); + max_segments = mdev->ldev->dc.max_bio_bvecs; put_ldev(mdev); } @@ -1054,14 +852,12 @@ void drbd_reconsider_max_bio_size(struct drbd_conf *mdev) Because new from 8.3.8 onwards the peer can use multiple BIOs for a single peer_request */ if (mdev->state.conn >= C_CONNECTED) { - if (mdev->tconn->agreed_pro_version < 94) - peer = min( mdev->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET); + if (mdev->agreed_pro_version < 94) { + peer = min(mdev->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET); /* Correct old drbd (up to 8.3.7) if it believes it can do more than 32KiB */ - else if (mdev->tconn->agreed_pro_version == 94) + } else if (mdev->agreed_pro_version == 94) peer = DRBD_MAX_SIZE_H80_PACKET; - else if (mdev->tconn->agreed_pro_version < 100) - peer = DRBD_MAX_BIO_SIZE_P95; /* drbd 8.3.8 onwards, before 8.4.0 */ - else + else /* drbd 8.3.8 onwards */ peer = DRBD_MAX_BIO_SIZE; } @@ -1076,27 +872,36 @@ void drbd_reconsider_max_bio_size(struct drbd_conf *mdev) drbd_setup_queue_param(mdev, new); } -/* Starts the worker thread */ -static void conn_reconfig_start(struct drbd_tconn *tconn) +/* serialize deconfig (worker exiting, doing cleanup) + * and reconfig (drbdsetup disk, drbdsetup net) + * + * Wait for a potentially exiting worker, then restart it, + * or start a new one. Flush any pending work, there may still be an + * after_state_change queued. + */ +static void drbd_reconfig_start(struct drbd_conf *mdev) { - drbd_thread_start(&tconn->worker); - conn_flush_workqueue(tconn); + wait_event(mdev->state_wait, !test_and_set_bit(CONFIG_PENDING, &mdev->flags)); + wait_event(mdev->state_wait, !test_bit(DEVICE_DYING, &mdev->flags)); + drbd_thread_start(&mdev->worker); + drbd_flush_workqueue(mdev); } -/* if still unconfigured, stops worker again. */ -static void conn_reconfig_done(struct drbd_tconn *tconn) +/* if still unconfigured, stops worker again. + * if configured now, clears CONFIG_PENDING. 
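The drbd_reconsider_max_bio_size() hunk above picks the peer's bio-size limit by negotiated protocol version; a compact restatement as an illustrative helper, using the constants from that hunk:

static unsigned int peer_bio_limit(int agreed_pro_version,
				   unsigned int peer_max_bio_size)
{
	if (agreed_pro_version < 94)	/* old peer: trust it, but cap at 32 KiB */
		return min_t(unsigned int, peer_max_bio_size,
			     DRBD_MAX_SIZE_H80_PACKET);
	if (agreed_pro_version == 94)	/* 8.3.7 overstates its own limit */
		return DRBD_MAX_SIZE_H80_PACKET;
	return DRBD_MAX_BIO_SIZE;	/* 8.3.8 and newer */
}
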
+ * wakes potential waiters */ +static void drbd_reconfig_done(struct drbd_conf *mdev) { - bool stop_threads; - spin_lock_irq(&tconn->req_lock); - stop_threads = conn_all_vols_unconf(tconn) && - tconn->cstate == C_STANDALONE; - spin_unlock_irq(&tconn->req_lock); - if (stop_threads) { - /* asender is implicitly stopped by receiver - * in conn_disconnect() */ - drbd_thread_stop(&tconn->receiver); - drbd_thread_stop(&tconn->worker); - } + spin_lock_irq(&mdev->req_lock); + if (mdev->state.disk == D_DISKLESS && + mdev->state.conn == C_STANDALONE && + mdev->state.role == R_SECONDARY) { + set_bit(DEVICE_DYING, &mdev->flags); + drbd_thread_stop_nowait(&mdev->worker); + } else + clear_bit(CONFIG_PENDING, &mdev->flags); + spin_unlock_irq(&mdev->req_lock); + wake_up(&mdev->state_wait); } /* Make sure IO is suspended before calling this function(). */ @@ -1104,187 +909,42 @@ static void drbd_suspend_al(struct drbd_conf *mdev) { int s = 0; - if (!lc_try_lock(mdev->act_log)) { + if (lc_try_lock(mdev->act_log)) { + drbd_al_shrink(mdev); + lc_unlock(mdev->act_log); + } else { dev_warn(DEV, "Failed to lock al in drbd_suspend_al()\n"); return; } - drbd_al_shrink(mdev); - spin_lock_irq(&mdev->tconn->req_lock); + spin_lock_irq(&mdev->req_lock); if (mdev->state.conn < C_CONNECTED) s = !test_and_set_bit(AL_SUSPENDED, &mdev->flags); - spin_unlock_irq(&mdev->tconn->req_lock); - lc_unlock(mdev->act_log); + + spin_unlock_irq(&mdev->req_lock); if (s) dev_info(DEV, "Suspended AL updates\n"); } - -static bool should_set_defaults(struct genl_info *info) -{ - unsigned flags = ((struct drbd_genlmsghdr*)info->userhdr)->flags; - return 0 != (flags & DRBD_GENL_F_SET_DEFAULTS); -} - -static void enforce_disk_conf_limits(struct disk_conf *dc) -{ - if (dc->al_extents < DRBD_AL_EXTENTS_MIN) - dc->al_extents = DRBD_AL_EXTENTS_MIN; - if (dc->al_extents > DRBD_AL_EXTENTS_MAX) - dc->al_extents = DRBD_AL_EXTENTS_MAX; - - if (dc->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX) - dc->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX; -} - -int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) -{ - enum drbd_ret_code retcode; - struct drbd_conf *mdev; - struct disk_conf *new_disk_conf, *old_disk_conf; - struct fifo_buffer *old_plan = NULL, *new_plan = NULL; - int err, fifo_size; - - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); - if (!adm_ctx.reply_skb) - return retcode; - if (retcode != NO_ERROR) - goto out; - - mdev = adm_ctx.mdev; - - /* we also need a disk - * to change the options on */ - if (!get_ldev(mdev)) { - retcode = ERR_NO_DISK; - goto out; - } - - new_disk_conf = kmalloc(sizeof(struct disk_conf), GFP_KERNEL); - if (!new_disk_conf) { - retcode = ERR_NOMEM; - goto fail; - } - - mutex_lock(&mdev->tconn->conf_update); - old_disk_conf = mdev->ldev->disk_conf; - *new_disk_conf = *old_disk_conf; - if (should_set_defaults(info)) - set_disk_conf_defaults(new_disk_conf); - - err = disk_conf_from_attrs_for_change(new_disk_conf, info); - if (err && err != -ENOMSG) { - retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(from_attrs_err_to_txt(err)); - } - - if (!expect(new_disk_conf->resync_rate >= 1)) - new_disk_conf->resync_rate = 1; - - enforce_disk_conf_limits(new_disk_conf); - - fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ; - if (fifo_size != mdev->rs_plan_s->size) { - new_plan = fifo_alloc(fifo_size); - if (!new_plan) { - dev_err(DEV, "kmalloc of fifo_buffer failed"); - retcode = ERR_NOMEM; - goto fail_unlock; - } - } - - drbd_suspend_io(mdev); - wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); - 
drbd_al_shrink(mdev); - err = drbd_check_al_size(mdev, new_disk_conf); - lc_unlock(mdev->act_log); - wake_up(&mdev->al_wait); - drbd_resume_io(mdev); - - if (err) { - retcode = ERR_NOMEM; - goto fail_unlock; - } - - write_lock_irq(&global_state_lock); - retcode = drbd_resync_after_valid(mdev, new_disk_conf->resync_after); - if (retcode == NO_ERROR) { - rcu_assign_pointer(mdev->ldev->disk_conf, new_disk_conf); - drbd_resync_after_changed(mdev); - } - write_unlock_irq(&global_state_lock); - - if (retcode != NO_ERROR) - goto fail_unlock; - - if (new_plan) { - old_plan = mdev->rs_plan_s; - rcu_assign_pointer(mdev->rs_plan_s, new_plan); - } - - mutex_unlock(&mdev->tconn->conf_update); - - if (new_disk_conf->al_updates) - mdev->ldev->md.flags &= ~MDF_AL_DISABLED; - else - mdev->ldev->md.flags |= MDF_AL_DISABLED; - - if (new_disk_conf->md_flushes) - clear_bit(MD_NO_FUA, &mdev->flags); - else - set_bit(MD_NO_FUA, &mdev->flags); - - drbd_bump_write_ordering(mdev->tconn, WO_bdev_flush); - - drbd_md_sync(mdev); - - if (mdev->state.conn >= C_CONNECTED) - drbd_send_sync_param(mdev); - - synchronize_rcu(); - kfree(old_disk_conf); - kfree(old_plan); - mod_timer(&mdev->request_timer, jiffies + HZ); - goto success; - -fail_unlock: - mutex_unlock(&mdev->tconn->conf_update); - fail: - kfree(new_disk_conf); - kfree(new_plan); -success: - put_ldev(mdev); - out: - drbd_adm_finish(info, retcode); - return 0; -} - -int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) +/* does always return 0; + * interesting return code is in reply->ret_code */ +static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) { - struct drbd_conf *mdev; - int err; enum drbd_ret_code retcode; enum determine_dev_size dd; sector_t max_possible_sectors; sector_t min_md_device_sectors; struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */ - struct disk_conf *new_disk_conf = NULL; struct block_device *bdev; struct lru_cache *resync_lru = NULL; - struct fifo_buffer *new_plan = NULL; union drbd_state ns, os; enum drbd_state_rv rv; - struct net_conf *nc; - - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); - if (!adm_ctx.reply_skb) - return retcode; - if (retcode != NO_ERROR) - goto finish; + int cp_discovered = 0; + int logical_block_size; - mdev = adm_ctx.mdev; - conn_reconfig_start(mdev->tconn); + drbd_reconfig_start(mdev); /* if you want to reconfigure, please tear down first */ if (mdev->state.disk > D_DISKLESS) { @@ -1299,65 +959,47 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) /* make sure there is no leftover from previous force-detach attempts */ clear_bit(FORCE_DETACH, &mdev->flags); - clear_bit(WAS_IO_ERROR, &mdev->flags); - clear_bit(WAS_READ_ERROR, &mdev->flags); /* and no leftover from previously aborted resync or verify, either */ mdev->rs_total = 0; mdev->rs_failed = 0; atomic_set(&mdev->rs_pending_cnt, 0); - /* allocation not in the IO path, drbdsetup context */ + /* allocation not in the IO path, cqueue thread context */ nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL); if (!nbc) { retcode = ERR_NOMEM; goto fail; } - spin_lock_init(&nbc->md.uuid_lock); - new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL); - if (!new_disk_conf) { - retcode = ERR_NOMEM; - goto fail; - } - nbc->disk_conf = new_disk_conf; + nbc->dc.disk_size = DRBD_DISK_SIZE_SECT_DEF; + nbc->dc.on_io_error = DRBD_ON_IO_ERROR_DEF; + nbc->dc.fencing = DRBD_FENCING_DEF; + nbc->dc.max_bio_bvecs = DRBD_MAX_BIO_BVECS_DEF; - 
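For reference, the drbd_adm_disk_opts() body removed above used the usual RCU publish-then-free sequence for the disk_conf pointer; a generic, hedged sketch in which the pointer, struct and locking context are placeholders:

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct disk_conf;				/* opaque here */
static struct disk_conf __rcu *conf;		/* stand-in for ldev->disk_conf */

static void publish_conf(struct disk_conf *new_conf)
{
	struct disk_conf *old_conf;

	/* update side assumed serialized by a mutex, hence the "1" */
	old_conf = rcu_dereference_protected(conf, 1);
	rcu_assign_pointer(conf, new_conf);
	synchronize_rcu();	/* wait until no reader can still see old_conf */
	kfree(old_conf);
}
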
set_disk_conf_defaults(new_disk_conf); - err = disk_conf_from_attrs(new_disk_conf, info); - if (err) { + if (!disk_conf_from_tags(mdev, nlp->tag_list, &nbc->dc)) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(from_attrs_err_to_txt(err)); goto fail; } - enforce_disk_conf_limits(new_disk_conf); - - new_plan = fifo_alloc((new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ); - if (!new_plan) { - retcode = ERR_NOMEM; - goto fail; - } - - if (new_disk_conf->meta_dev_idx < DRBD_MD_INDEX_FLEX_INT) { + if (nbc->dc.meta_dev_idx < DRBD_MD_INDEX_FLEX_INT) { retcode = ERR_MD_IDX_INVALID; goto fail; } - rcu_read_lock(); - nc = rcu_dereference(mdev->tconn->net_conf); - if (nc) { - if (new_disk_conf->fencing == FP_STONITH && nc->wire_protocol == DRBD_PROT_A) { - rcu_read_unlock(); + if (get_net_conf(mdev)) { + int prot = mdev->net_conf->wire_protocol; + put_net_conf(mdev); + if (nbc->dc.fencing == FP_STONITH && prot == DRBD_PROT_A) { retcode = ERR_STONITH_AND_PROT_A; goto fail; } } - rcu_read_unlock(); - bdev = blkdev_get_by_path(new_disk_conf->backing_dev, + bdev = blkdev_get_by_path(nbc->dc.backing_dev, FMODE_READ | FMODE_WRITE | FMODE_EXCL, mdev); if (IS_ERR(bdev)) { - dev_err(DEV, "open(\"%s\") failed with %ld\n", new_disk_conf->backing_dev, + dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.backing_dev, PTR_ERR(bdev)); retcode = ERR_OPEN_DISK; goto fail; @@ -1372,12 +1014,12 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) * should check it for you already; but if you don't, or * someone fooled it, we need to double check here) */ - bdev = blkdev_get_by_path(new_disk_conf->meta_dev, + bdev = blkdev_get_by_path(nbc->dc.meta_dev, FMODE_READ | FMODE_WRITE | FMODE_EXCL, - (new_disk_conf->meta_dev_idx < 0) ? + (nbc->dc.meta_dev_idx < 0) ? 
(void *)mdev : (void *)drbd_m_holder); if (IS_ERR(bdev)) { - dev_err(DEV, "open(\"%s\") failed with %ld\n", new_disk_conf->meta_dev, + dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.meta_dev, PTR_ERR(bdev)); retcode = ERR_OPEN_MD_DISK; goto fail; @@ -1385,14 +1027,14 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) nbc->md_bdev = bdev; if ((nbc->backing_bdev == nbc->md_bdev) != - (new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_INTERNAL || - new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) { + (nbc->dc.meta_dev_idx == DRBD_MD_INDEX_INTERNAL || + nbc->dc.meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) { retcode = ERR_MD_IDX_INVALID; goto fail; } resync_lru = lc_create("resync", drbd_bm_ext_cache, - 1, 61, sizeof(struct bm_extent), + 61, sizeof(struct bm_extent), offsetof(struct bm_extent, lce)); if (!resync_lru) { retcode = ERR_NOMEM; @@ -1402,21 +1044,21 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) /* RT - for drbd_get_max_capacity() DRBD_MD_INDEX_FLEX_INT */ drbd_md_set_sector_offsets(mdev, nbc); - if (drbd_get_max_capacity(nbc) < new_disk_conf->disk_size) { + if (drbd_get_max_capacity(nbc) < nbc->dc.disk_size) { dev_err(DEV, "max capacity %llu smaller than disk size %llu\n", (unsigned long long) drbd_get_max_capacity(nbc), - (unsigned long long) new_disk_conf->disk_size); + (unsigned long long) nbc->dc.disk_size); retcode = ERR_DISK_TOO_SMALL; goto fail; } - if (new_disk_conf->meta_dev_idx < 0) { + if (nbc->dc.meta_dev_idx < 0) { max_possible_sectors = DRBD_MAX_SECTORS_FLEX; /* at least one MB, otherwise it does not make sense */ min_md_device_sectors = (2<<10); } else { max_possible_sectors = DRBD_MAX_SECTORS; - min_md_device_sectors = MD_RESERVED_SECT * (new_disk_conf->meta_dev_idx + 1); + min_md_device_sectors = MD_RESERVED_SECT * (nbc->dc.meta_dev_idx + 1); } if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) { @@ -1441,20 +1083,14 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) dev_warn(DEV, "==> truncating very big lower level device " "to currently maximum possible %llu sectors <==\n", (unsigned long long) max_possible_sectors); - if (new_disk_conf->meta_dev_idx >= 0) + if (nbc->dc.meta_dev_idx >= 0) dev_warn(DEV, "==>> using internal or flexible " "meta data may help <<==\n"); } drbd_suspend_io(mdev); /* also wait for the last barrier ack. */ - /* FIXME see also https://daiquiri.linbit/cgi-bin/bugzilla/show_bug.cgi?id=171 - * We need a way to either ignore barrier acks for barriers sent before a device - * was attached, or a way to wait for all pending barrier acks to come in. - * As barriers are counted per resource, - * we'd need to suspend io on all devices of a resource. 
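A worked example for the external-meta-data minimum computed just above, assuming MD_RESERVED_SECT corresponds to 128 MiB worth of 512-byte sectors (its historical value):

/*	min_md_device_sectors = MD_RESERVED_SECT * (meta_dev_idx + 1)
 *
 *	meta_dev_idx = 0  ->  1 * 128 MiB
 *	meta_dev_idx = 3  ->  4 * 128 MiB
 *
 * i.e. an external meta-data device must provide one fixed-size slot per
 * index up to and including the one being attached. */
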
- */ - wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_pending_cnt) || drbd_suspended(mdev)); + wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_pending_cnt) || is_susp(mdev->state)); /* and for any other previously queued work */ drbd_flush_workqueue(mdev); @@ -1469,6 +1105,25 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) drbd_md_set_sector_offsets(mdev, nbc); + /* allocate a second IO page if logical_block_size != 512 */ + logical_block_size = bdev_logical_block_size(nbc->md_bdev); + if (logical_block_size == 0) + logical_block_size = MD_SECTOR_SIZE; + + if (logical_block_size != MD_SECTOR_SIZE) { + if (!mdev->md_io_tmpp) { + struct page *page = alloc_page(GFP_NOIO); + if (!page) + goto force_diskless_dec; + + dev_warn(DEV, "Meta data's bdev logical_block_size = %d != %d\n", + logical_block_size, MD_SECTOR_SIZE); + dev_warn(DEV, "Workaround engaged (has performance impact).\n"); + + mdev->md_io_tmpp = page; + } + } + if (!mdev->bitmap) { if (drbd_bm_init(mdev)) { retcode = ERR_NOMEM; @@ -1490,25 +1145,30 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) } /* Since we are diskless, fix the activity log first... */ - if (drbd_check_al_size(mdev, new_disk_conf)) { + if (drbd_check_al_size(mdev)) { retcode = ERR_NOMEM; goto force_diskless_dec; } /* Prevent shrinking of consistent devices ! */ if (drbd_md_test_flag(nbc, MDF_CONSISTENT) && - drbd_new_dev_size(mdev, nbc, nbc->disk_conf->disk_size, 0) < nbc->md.la_size_sect) { + drbd_new_dev_size(mdev, nbc, 0) < nbc->md.la_size_sect) { dev_warn(DEV, "refusing to truncate a consistent device\n"); retcode = ERR_DISK_TOO_SMALL; goto force_diskless_dec; } + if (!drbd_al_read_log(mdev, nbc)) { + retcode = ERR_IO_MD_DISK; + goto force_diskless_dec; + } + /* Reset the "barriers don't work" bits here, then force meta data to * be written, to ensure we determine if barriers are supported. */ - if (new_disk_conf->md_flushes) - clear_bit(MD_NO_FUA, &mdev->flags); - else + if (nbc->dc.no_md_flush) set_bit(MD_NO_FUA, &mdev->flags); + else + clear_bit(MD_NO_FUA, &mdev->flags); /* Point of no return reached. * Devices and memory are no longer released by error cleanup below. 
@@ -1517,13 +1177,11 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) D_ASSERT(mdev->ldev == NULL); mdev->ldev = nbc; mdev->resync = resync_lru; - mdev->rs_plan_s = new_plan; nbc = NULL; resync_lru = NULL; - new_disk_conf = NULL; - new_plan = NULL; - drbd_bump_write_ordering(mdev->tconn, WO_bdev_flush); + mdev->write_ordering = WO_bdev_flush; + drbd_bump_write_ordering(mdev, WO_bdev_flush); if (drbd_md_test_flag(mdev->ldev, MDF_CRASHED_PRIMARY)) set_bit(CRASHED_PRIMARY, &mdev->flags); @@ -1531,8 +1189,10 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) clear_bit(CRASHED_PRIMARY, &mdev->flags); if (drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND) && - !(mdev->state.role == R_PRIMARY && mdev->tconn->susp_nod)) + !(mdev->state.role == R_PRIMARY && mdev->state.susp_nod)) { set_bit(CRASHED_PRIMARY, &mdev->flags); + cp_discovered = 1; + } mdev->send_cnt = 0; mdev->recv_cnt = 0; @@ -1568,9 +1228,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) } else if (dd == grew) set_bit(RESYNC_AFTER_NEG, &mdev->flags); - if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC) || - (test_bit(CRASHED_PRIMARY, &mdev->flags) && - drbd_md_test_flag(mdev->ldev, MDF_AL_DISABLED))) { + if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) { dev_info(DEV, "Assuming that all blocks are out of sync " "(aka FullSync)\n"); if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, @@ -1580,7 +1238,16 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) } } else { if (drbd_bitmap_io(mdev, &drbd_bm_read, - "read from attaching", BM_LOCKED_MASK)) { + "read from attaching", BM_LOCKED_MASK) < 0) { + retcode = ERR_IO_MD_DISK; + goto force_diskless_dec; + } + } + + if (cp_discovered) { + drbd_al_apply_to_bm(mdev); + if (drbd_bitmap_io(mdev, &drbd_bm_write, + "crashed primary apply AL", BM_LOCKED_MASK)) { retcode = ERR_IO_MD_DISK; goto force_diskless_dec; } @@ -1589,9 +1256,9 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) if (_drbd_bm_total_weight(mdev) == drbd_bm_bits(mdev)) drbd_suspend_al(mdev); /* IO is still suspended here... */ - spin_lock_irq(&mdev->tconn->req_lock); - os = drbd_read_state(mdev); - ns = os; + spin_lock_irq(&mdev->req_lock); + os = mdev->state; + ns.i = os.i; /* If MDF_CONSISTENT is not set go into inconsistent state, otherwise investigate MDF_WasUpToDate... If MDF_WAS_UP_TO_DATE is not set go into D_OUTDATED disk state, @@ -1609,9 +1276,8 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) if (drbd_md_test_flag(mdev->ldev, MDF_PEER_OUT_DATED)) ns.pdsk = D_OUTDATED; - rcu_read_lock(); - if (ns.disk == D_CONSISTENT && - (ns.pdsk == D_OUTDATED || rcu_dereference(mdev->ldev->disk_conf)->fencing == FP_DONT_CARE)) + if ( ns.disk == D_CONSISTENT && + (ns.pdsk == D_OUTDATED || mdev->ldev->dc.fencing == FP_DONT_CARE)) ns.disk = D_UP_TO_DATE; /* All tests on MDF_PRIMARY_IND, MDF_CONNECTED_IND, @@ -1619,13 +1285,6 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) this point, because drbd_request_state() modifies these flags. */ - if (rcu_dereference(mdev->ldev->disk_conf)->al_updates) - mdev->ldev->md.flags &= ~MDF_AL_DISABLED; - else - mdev->ldev->md.flags |= MDF_AL_DISABLED; - - rcu_read_unlock(); - /* In case we are C_CONNECTED postpone any decision on the new disk state after the negotiation phase. 
*/ if (mdev->state.conn == C_CONNECTED) { @@ -1641,13 +1300,12 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) } rv = _drbd_set_state(mdev, ns, CS_VERBOSE, NULL); - spin_unlock_irq(&mdev->tconn->req_lock); + ns = mdev->state; + spin_unlock_irq(&mdev->req_lock); if (rv < SS_SUCCESS) goto force_diskless_dec; - mod_timer(&mdev->request_timer, jiffies + HZ); - if (mdev->state.role == R_PRIMARY) mdev->ldev->md.uuid[UI_CURRENT] |= (u64)1; else @@ -1658,17 +1316,16 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE); put_ldev(mdev); - conn_reconfig_done(mdev->tconn); - drbd_adm_finish(info, retcode); + reply->ret_code = retcode; + drbd_reconfig_done(mdev); return 0; force_diskless_dec: put_ldev(mdev); force_diskless: - drbd_force_state(mdev, NS(disk, D_DISKLESS)); + drbd_force_state(mdev, NS(disk, D_FAILED)); drbd_md_sync(mdev); fail: - conn_reconfig_done(mdev->tconn); if (nbc) { if (nbc->backing_bdev) blkdev_put(nbc->backing_bdev, @@ -1678,24 +1335,34 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) FMODE_READ | FMODE_WRITE | FMODE_EXCL); kfree(nbc); } - kfree(new_disk_conf); lc_destroy(resync_lru); - kfree(new_plan); - finish: - drbd_adm_finish(info, retcode); + reply->ret_code = retcode; + drbd_reconfig_done(mdev); return 0; } -static int adm_detach(struct drbd_conf *mdev, int force) +/* Detaching the disk is a process in multiple stages. First we need to lock + * out application IO, in-flight IO, IO stuck in drbd_al_begin_io. + * Then we transition to D_DISKLESS, and wait for put_ldev() to return all + * internal references as well. + * Only then we have finally detached. */ +static int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) { - enum drbd_state_rv retcode; + enum drbd_ret_code retcode; int ret; + struct detach dt = {}; + + if (!detach_from_tags(mdev, nlp->tag_list, &dt)) { + reply->ret_code = ERR_MANDATORY_TAG; + goto out; + } - if (force) { + if (dt.detach_force) { set_bit(FORCE_DETACH, &mdev->flags); drbd_force_state(mdev, NS(disk, D_FAILED)); - retcode = SS_SUCCESS; + reply->ret_code = SS_SUCCESS; goto out; } @@ -1707,697 +1374,608 @@ static int adm_detach(struct drbd_conf *mdev, int force) ret = wait_event_interruptible(mdev->misc_wait, mdev->state.disk != D_FAILED); drbd_resume_io(mdev); + if ((int)retcode == (int)SS_IS_DISKLESS) retcode = SS_NOTHING_TO_DO; if (ret) retcode = ERR_INTR; + reply->ret_code = retcode; out: - return retcode; + return 0; } -/* Detaching the disk is a process in multiple stages. First we need to lock - * out application IO, in-flight IO, IO stuck in drbd_al_begin_io. - * Then we transition to D_DISKLESS, and wait for put_ldev() to return all - * internal references as well. - * Only then we have finally detached. 
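A brief note on the uuid[UI_CURRENT] |= (u64)1 in the attach success path earlier in this hunk: judging from these hunks (and from drbd_uuid_set_bm masking with ~(u64)1 further up), the lowest bit of the current UUID appears to flag data written while primary:

/*	become primary:   uuid[UI_CURRENT] |=  (u64)1;   ...4a6c -> ...4a6d
 *	become secondary: uuid[UI_CURRENT] &= ~(u64)1;   ...4a6d -> ...4a6c
 */
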
*/ -int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info) +static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) { + int i, ns; enum drbd_ret_code retcode; - struct detach_parms parms = { }; - int err; + struct net_conf *new_conf = NULL; + struct crypto_hash *tfm = NULL; + struct crypto_hash *integrity_w_tfm = NULL; + struct crypto_hash *integrity_r_tfm = NULL; + struct hlist_head *new_tl_hash = NULL; + struct hlist_head *new_ee_hash = NULL; + struct drbd_conf *odev; + char hmac_name[CRYPTO_MAX_ALG_NAME]; + void *int_dig_out = NULL; + void *int_dig_in = NULL; + void *int_dig_vv = NULL; + struct sockaddr *new_my_addr, *new_peer_addr, *taken_addr; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); - if (!adm_ctx.reply_skb) - return retcode; - if (retcode != NO_ERROR) - goto out; + drbd_reconfig_start(mdev); - if (info->attrs[DRBD_NLA_DETACH_PARMS]) { - err = detach_parms_from_attrs(&parms, info); - if (err) { - retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(from_attrs_err_to_txt(err)); - goto out; - } + if (mdev->state.conn > C_STANDALONE) { + retcode = ERR_NET_CONFIGURED; + goto fail; } - retcode = adm_detach(adm_ctx.mdev, parms.force_detach); -out: - drbd_adm_finish(info, retcode); - return 0; -} - -static bool conn_resync_running(struct drbd_tconn *tconn) -{ - struct drbd_conf *mdev; - bool rv = false; - int vnr; - - rcu_read_lock(); - idr_for_each_entry(&tconn->volumes, mdev, vnr) { - if (mdev->state.conn == C_SYNC_SOURCE || - mdev->state.conn == C_SYNC_TARGET || - mdev->state.conn == C_PAUSED_SYNC_S || - mdev->state.conn == C_PAUSED_SYNC_T) { - rv = true; - break; - } + /* allocation not in the IO path, cqueue thread context */ + new_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL); + if (!new_conf) { + retcode = ERR_NOMEM; + goto fail; } - rcu_read_unlock(); - - return rv; -} -static bool conn_ov_running(struct drbd_tconn *tconn) -{ - struct drbd_conf *mdev; - bool rv = false; - int vnr; - - rcu_read_lock(); - idr_for_each_entry(&tconn->volumes, mdev, vnr) { - if (mdev->state.conn == C_VERIFY_S || - mdev->state.conn == C_VERIFY_T) { - rv = true; - break; - } + new_conf->timeout = DRBD_TIMEOUT_DEF; + new_conf->try_connect_int = DRBD_CONNECT_INT_DEF; + new_conf->ping_int = DRBD_PING_INT_DEF; + new_conf->max_epoch_size = DRBD_MAX_EPOCH_SIZE_DEF; + new_conf->max_buffers = DRBD_MAX_BUFFERS_DEF; + new_conf->unplug_watermark = DRBD_UNPLUG_WATERMARK_DEF; + new_conf->sndbuf_size = DRBD_SNDBUF_SIZE_DEF; + new_conf->rcvbuf_size = DRBD_RCVBUF_SIZE_DEF; + new_conf->ko_count = DRBD_KO_COUNT_DEF; + new_conf->after_sb_0p = DRBD_AFTER_SB_0P_DEF; + new_conf->after_sb_1p = DRBD_AFTER_SB_1P_DEF; + new_conf->after_sb_2p = DRBD_AFTER_SB_2P_DEF; + new_conf->want_lose = 0; + new_conf->two_primaries = 0; + new_conf->wire_protocol = DRBD_PROT_C; + new_conf->ping_timeo = DRBD_PING_TIMEO_DEF; + new_conf->rr_conflict = DRBD_RR_CONFLICT_DEF; + new_conf->on_congestion = DRBD_ON_CONGESTION_DEF; + new_conf->cong_extents = DRBD_CONG_EXTENTS_DEF; + + if (!net_conf_from_tags(mdev, nlp->tag_list, new_conf)) { + retcode = ERR_MANDATORY_TAG; + goto fail; } - rcu_read_unlock(); - - return rv; -} - -static enum drbd_ret_code -_check_net_options(struct drbd_tconn *tconn, struct net_conf *old_conf, struct net_conf *new_conf) -{ - struct drbd_conf *mdev; - int i; - - if (old_conf && tconn->cstate == C_WF_REPORT_PARAMS && tconn->agreed_pro_version < 100) { - if (new_conf->wire_protocol != old_conf->wire_protocol) - return ERR_NEED_APV_100; - 
- if (new_conf->two_primaries != old_conf->two_primaries) - return ERR_NEED_APV_100; - if (strcmp(new_conf->integrity_alg, old_conf->integrity_alg)) - return ERR_NEED_APV_100; + if (new_conf->two_primaries + && (new_conf->wire_protocol != DRBD_PROT_C)) { + retcode = ERR_NOT_PROTO_C; + goto fail; } - if (!new_conf->two_primaries && - conn_highest_role(tconn) == R_PRIMARY && - conn_highest_peer(tconn) == R_PRIMARY) - return ERR_NEED_ALLOW_TWO_PRI; - - if (new_conf->two_primaries && - (new_conf->wire_protocol != DRBD_PROT_C)) - return ERR_NOT_PROTO_C; - - idr_for_each_entry(&tconn->volumes, mdev, i) { - if (get_ldev(mdev)) { - enum drbd_fencing_p fp = rcu_dereference(mdev->ldev->disk_conf)->fencing; - put_ldev(mdev); - if (new_conf->wire_protocol == DRBD_PROT_A && fp == FP_STONITH) - return ERR_STONITH_AND_PROT_A; + if (get_ldev(mdev)) { + enum drbd_fencing_p fp = mdev->ldev->dc.fencing; + put_ldev(mdev); + if (new_conf->wire_protocol == DRBD_PROT_A && fp == FP_STONITH) { + retcode = ERR_STONITH_AND_PROT_A; + goto fail; } - if (mdev->state.role == R_PRIMARY && new_conf->discard_my_data) - return ERR_DISCARD_IMPOSSIBLE; } - if (new_conf->on_congestion != OC_BLOCK && new_conf->wire_protocol != DRBD_PROT_A) - return ERR_CONG_NOT_PROTO_A; - - return NO_ERROR; -} - -static enum drbd_ret_code -check_net_options(struct drbd_tconn *tconn, struct net_conf *new_conf) -{ - static enum drbd_ret_code rv; - struct drbd_conf *mdev; - int i; - - rcu_read_lock(); - rv = _check_net_options(tconn, rcu_dereference(tconn->net_conf), new_conf); - rcu_read_unlock(); - - /* tconn->volumes protected by genl_lock() here */ - idr_for_each_entry(&tconn->volumes, mdev, i) { - if (!mdev->bitmap) { - if(drbd_bm_init(mdev)) - return ERR_NOMEM; - } + if (new_conf->on_congestion != OC_BLOCK && new_conf->wire_protocol != DRBD_PROT_A) { + retcode = ERR_CONG_NOT_PROTO_A; + goto fail; } - return rv; -} - -struct crypto { - struct crypto_hash *verify_tfm; - struct crypto_hash *csums_tfm; - struct crypto_hash *cram_hmac_tfm; - struct crypto_hash *integrity_tfm; -}; + if (mdev->state.role == R_PRIMARY && new_conf->want_lose) { + retcode = ERR_DISCARD; + goto fail; + } -static int -alloc_hash(struct crypto_hash **tfm, char *tfm_name, int err_alg) -{ - if (!tfm_name[0]) - return NO_ERROR; + retcode = NO_ERROR; - *tfm = crypto_alloc_hash(tfm_name, 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(*tfm)) { - *tfm = NULL; - return err_alg; + new_my_addr = (struct sockaddr *)&new_conf->my_addr; + new_peer_addr = (struct sockaddr *)&new_conf->peer_addr; + for (i = 0; i < minor_count; i++) { + odev = minor_to_mdev(i); + if (!odev || odev == mdev) + continue; + if (get_net_conf(odev)) { + taken_addr = (struct sockaddr *)&odev->net_conf->my_addr; + if (new_conf->my_addr_len == odev->net_conf->my_addr_len && + !memcmp(new_my_addr, taken_addr, new_conf->my_addr_len)) + retcode = ERR_LOCAL_ADDR; + + taken_addr = (struct sockaddr *)&odev->net_conf->peer_addr; + if (new_conf->peer_addr_len == odev->net_conf->peer_addr_len && + !memcmp(new_peer_addr, taken_addr, new_conf->peer_addr_len)) + retcode = ERR_PEER_ADDR; + + put_net_conf(odev); + if (retcode != NO_ERROR) + goto fail; + } } - return NO_ERROR; -} - -static enum drbd_ret_code -alloc_crypto(struct crypto *crypto, struct net_conf *new_conf) -{ - char hmac_name[CRYPTO_MAX_ALG_NAME]; - enum drbd_ret_code rv; - - rv = alloc_hash(&crypto->csums_tfm, new_conf->csums_alg, - ERR_CSUMS_ALG); - if (rv != NO_ERROR) - return rv; - rv = alloc_hash(&crypto->verify_tfm, new_conf->verify_alg, - ERR_VERIFY_ALG); - if (rv != 
NO_ERROR) - return rv; - rv = alloc_hash(&crypto->integrity_tfm, new_conf->integrity_alg, - ERR_INTEGRITY_ALG); - if (rv != NO_ERROR) - return rv; if (new_conf->cram_hmac_alg[0] != 0) { snprintf(hmac_name, CRYPTO_MAX_ALG_NAME, "hmac(%s)", - new_conf->cram_hmac_alg); + new_conf->cram_hmac_alg); + tfm = crypto_alloc_hash(hmac_name, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(tfm)) { + tfm = NULL; + retcode = ERR_AUTH_ALG; + goto fail; + } - rv = alloc_hash(&crypto->cram_hmac_tfm, hmac_name, - ERR_AUTH_ALG); + if (!drbd_crypto_is_hash(crypto_hash_tfm(tfm))) { + retcode = ERR_AUTH_ALG_ND; + goto fail; + } } - return rv; -} - -static void free_crypto(struct crypto *crypto) -{ - crypto_free_hash(crypto->cram_hmac_tfm); - crypto_free_hash(crypto->integrity_tfm); - crypto_free_hash(crypto->csums_tfm); - crypto_free_hash(crypto->verify_tfm); -} - -int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info) -{ - enum drbd_ret_code retcode; - struct drbd_tconn *tconn; - struct net_conf *old_conf, *new_conf = NULL; - int err; - int ovr; /* online verify running */ - int rsr; /* re-sync running */ - struct crypto crypto = { }; - - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_CONNECTION); - if (!adm_ctx.reply_skb) - return retcode; - if (retcode != NO_ERROR) - goto out; + if (new_conf->integrity_alg[0]) { + integrity_w_tfm = crypto_alloc_hash(new_conf->integrity_alg, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(integrity_w_tfm)) { + integrity_w_tfm = NULL; + retcode=ERR_INTEGRITY_ALG; + goto fail; + } - tconn = adm_ctx.tconn; + if (!drbd_crypto_is_hash(crypto_hash_tfm(integrity_w_tfm))) { + retcode=ERR_INTEGRITY_ALG_ND; + goto fail; + } - new_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL); - if (!new_conf) { - retcode = ERR_NOMEM; - goto out; + integrity_r_tfm = crypto_alloc_hash(new_conf->integrity_alg, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(integrity_r_tfm)) { + integrity_r_tfm = NULL; + retcode=ERR_INTEGRITY_ALG; + goto fail; + } } - conn_reconfig_start(tconn); - - mutex_lock(&tconn->data.mutex); - mutex_lock(&tconn->conf_update); - old_conf = tconn->net_conf; - - if (!old_conf) { - drbd_msg_put_info("net conf missing, try connect"); - retcode = ERR_INVALID_REQUEST; - goto fail; + ns = new_conf->max_epoch_size/8; + if (mdev->tl_hash_s != ns) { + new_tl_hash = kzalloc(ns*sizeof(void *), GFP_KERNEL); + if (!new_tl_hash) { + retcode = ERR_NOMEM; + goto fail; + } } - *new_conf = *old_conf; - if (should_set_defaults(info)) - set_net_conf_defaults(new_conf); - - err = net_conf_from_attrs_for_change(new_conf, info); - if (err && err != -ENOMSG) { - retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(from_attrs_err_to_txt(err)); - goto fail; + ns = new_conf->max_buffers/8; + if (new_conf->two_primaries && (mdev->ee_hash_s != ns)) { + new_ee_hash = kzalloc(ns*sizeof(void *), GFP_KERNEL); + if (!new_ee_hash) { + retcode = ERR_NOMEM; + goto fail; + } } - retcode = check_net_options(tconn, new_conf); - if (retcode != NO_ERROR) - goto fail; + ((char *)new_conf->shared_secret)[SHARED_SECRET_MAX-1] = 0; - /* re-sync running */ - rsr = conn_resync_running(tconn); - if (rsr && strcmp(new_conf->csums_alg, old_conf->csums_alg)) { - retcode = ERR_CSUMS_RESYNC_RUNNING; - goto fail; + if (integrity_w_tfm) { + i = crypto_hash_digestsize(integrity_w_tfm); + int_dig_out = kmalloc(i, GFP_KERNEL); + if (!int_dig_out) { + retcode = ERR_NOMEM; + goto fail; + } + int_dig_in = kmalloc(i, GFP_KERNEL); + if (!int_dig_in) { + retcode = ERR_NOMEM; + goto fail; + } + int_dig_vv = kmalloc(i, GFP_KERNEL); + if (!int_dig_vv) { + retcode = 
ERR_NOMEM; + goto fail; + } } - /* online verify running */ - ovr = conn_ov_running(tconn); - if (ovr && strcmp(new_conf->verify_alg, old_conf->verify_alg)) { - retcode = ERR_VERIFY_RUNNING; - goto fail; + if (!mdev->bitmap) { + if(drbd_bm_init(mdev)) { + retcode = ERR_NOMEM; + goto fail; + } } - retcode = alloc_crypto(&crypto, new_conf); - if (retcode != NO_ERROR) + drbd_flush_workqueue(mdev); + spin_lock_irq(&mdev->req_lock); + if (mdev->net_conf != NULL) { + retcode = ERR_NET_CONFIGURED; + spin_unlock_irq(&mdev->req_lock); goto fail; + } + mdev->net_conf = new_conf; - rcu_assign_pointer(tconn->net_conf, new_conf); + mdev->send_cnt = 0; + mdev->recv_cnt = 0; - if (!rsr) { - crypto_free_hash(tconn->csums_tfm); - tconn->csums_tfm = crypto.csums_tfm; - crypto.csums_tfm = NULL; + if (new_tl_hash) { + kfree(mdev->tl_hash); + mdev->tl_hash_s = mdev->net_conf->max_epoch_size/8; + mdev->tl_hash = new_tl_hash; } - if (!ovr) { - crypto_free_hash(tconn->verify_tfm); - tconn->verify_tfm = crypto.verify_tfm; - crypto.verify_tfm = NULL; + + if (new_ee_hash) { + kfree(mdev->ee_hash); + mdev->ee_hash_s = mdev->net_conf->max_buffers/8; + mdev->ee_hash = new_ee_hash; } - crypto_free_hash(tconn->integrity_tfm); - tconn->integrity_tfm = crypto.integrity_tfm; - if (tconn->cstate >= C_WF_REPORT_PARAMS && tconn->agreed_pro_version >= 100) - /* Do this without trying to take tconn->data.mutex again. */ - __drbd_send_protocol(tconn, P_PROTOCOL_UPDATE); + crypto_free_hash(mdev->cram_hmac_tfm); + mdev->cram_hmac_tfm = tfm; - crypto_free_hash(tconn->cram_hmac_tfm); - tconn->cram_hmac_tfm = crypto.cram_hmac_tfm; + crypto_free_hash(mdev->integrity_w_tfm); + mdev->integrity_w_tfm = integrity_w_tfm; - mutex_unlock(&tconn->conf_update); - mutex_unlock(&tconn->data.mutex); - synchronize_rcu(); - kfree(old_conf); + crypto_free_hash(mdev->integrity_r_tfm); + mdev->integrity_r_tfm = integrity_r_tfm; - if (tconn->cstate >= C_WF_REPORT_PARAMS) - drbd_send_sync_param(minor_to_mdev(conn_lowest_minor(tconn))); + kfree(mdev->int_dig_out); + kfree(mdev->int_dig_in); + kfree(mdev->int_dig_vv); + mdev->int_dig_out=int_dig_out; + mdev->int_dig_in=int_dig_in; + mdev->int_dig_vv=int_dig_vv; + retcode = _drbd_set_state(_NS(mdev, conn, C_UNCONNECTED), CS_VERBOSE, NULL); + spin_unlock_irq(&mdev->req_lock); - goto done; + kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE); + reply->ret_code = retcode; + drbd_reconfig_done(mdev); + return 0; - fail: - mutex_unlock(&tconn->conf_update); - mutex_unlock(&tconn->data.mutex); - free_crypto(&crypto); +fail: + kfree(int_dig_out); + kfree(int_dig_in); + kfree(int_dig_vv); + crypto_free_hash(tfm); + crypto_free_hash(integrity_w_tfm); + crypto_free_hash(integrity_r_tfm); + kfree(new_tl_hash); + kfree(new_ee_hash); kfree(new_conf); - done: - conn_reconfig_done(tconn); - out: - drbd_adm_finish(info, retcode); + + reply->ret_code = retcode; + drbd_reconfig_done(mdev); return 0; } -int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) +static int drbd_nl_disconnect(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) { - struct drbd_conf *mdev; - struct net_conf *old_conf, *new_conf = NULL; - struct crypto crypto = { }; - struct drbd_tconn *tconn; - enum drbd_ret_code retcode; - int i; - int err; + int retcode; + struct disconnect dc; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE); + memset(&dc, 0, sizeof(struct disconnect)); + if (!disconnect_from_tags(mdev, nlp->tag_list, &dc)) { + retcode = ERR_MANDATORY_TAG; + goto fail; + } - 
if (!adm_ctx.reply_skb) - return retcode; - if (retcode != NO_ERROR) - goto out; - if (!(adm_ctx.my_addr && adm_ctx.peer_addr)) { - drbd_msg_put_info("connection endpoint(s) missing"); - retcode = ERR_INVALID_REQUEST; - goto out; + if (dc.force) { + spin_lock_irq(&mdev->req_lock); + if (mdev->state.conn >= C_WF_CONNECTION) + _drbd_set_state(_NS(mdev, conn, C_DISCONNECTING), CS_HARD, NULL); + spin_unlock_irq(&mdev->req_lock); + goto done; } - /* No need for _rcu here. All reconfiguration is - * strictly serialized on genl_lock(). We are protected against - * concurrent reconfiguration/addition/deletion */ - list_for_each_entry(tconn, &drbd_tconns, all_tconn) { - if (nla_len(adm_ctx.my_addr) == tconn->my_addr_len && - !memcmp(nla_data(adm_ctx.my_addr), &tconn->my_addr, tconn->my_addr_len)) { - retcode = ERR_LOCAL_ADDR; - goto out; - } + retcode = _drbd_request_state(mdev, NS(conn, C_DISCONNECTING), CS_ORDERED); - if (nla_len(adm_ctx.peer_addr) == tconn->peer_addr_len && - !memcmp(nla_data(adm_ctx.peer_addr), &tconn->peer_addr, tconn->peer_addr_len)) { - retcode = ERR_PEER_ADDR; - goto out; + if (retcode == SS_NOTHING_TO_DO) + goto done; + else if (retcode == SS_ALREADY_STANDALONE) + goto done; + else if (retcode == SS_PRIMARY_NOP) { + /* Our statche checking code wants to see the peer outdated. */ + retcode = drbd_request_state(mdev, NS2(conn, C_DISCONNECTING, + pdsk, D_OUTDATED)); + } else if (retcode == SS_CW_FAILED_BY_PEER) { + /* The peer probably wants to see us outdated. */ + retcode = _drbd_request_state(mdev, NS2(conn, C_DISCONNECTING, + disk, D_OUTDATED), + CS_ORDERED); + if (retcode == SS_IS_DISKLESS || retcode == SS_LOWER_THAN_OUTDATED) { + drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); + retcode = SS_SUCCESS; } } - tconn = adm_ctx.tconn; - conn_reconfig_start(tconn); - - if (tconn->cstate > C_STANDALONE) { - retcode = ERR_NET_CONFIGURED; + if (retcode < SS_SUCCESS) goto fail; - } - /* allocation not in the IO path, drbdsetup / netlink process context */ - new_conf = kzalloc(sizeof(*new_conf), GFP_KERNEL); - if (!new_conf) { - retcode = ERR_NOMEM; + if (wait_event_interruptible(mdev->state_wait, + mdev->state.conn != C_DISCONNECTING)) { + /* Do not test for mdev->state.conn == C_STANDALONE, since + someone else might connect us in the mean time! 
*/ + retcode = ERR_INTR; goto fail; } - set_net_conf_defaults(new_conf); + done: + retcode = NO_ERROR; + fail: + drbd_md_sync(mdev); + reply->ret_code = retcode; + return 0; +} + +void resync_after_online_grow(struct drbd_conf *mdev) +{ + int iass; /* I am sync source */ + + dev_info(DEV, "Resync of new storage after online grow\n"); + if (mdev->state.role != mdev->state.peer) + iass = (mdev->state.role == R_PRIMARY); + else + iass = test_bit(DISCARD_CONCURRENT, &mdev->flags); + + if (iass) + drbd_start_resync(mdev, C_SYNC_SOURCE); + else + _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE + CS_SERIALIZE); +} + +static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + struct resize rs; + int retcode = NO_ERROR; + enum determine_dev_size dd; + enum dds_flags ddsf; - err = net_conf_from_attrs(new_conf, info); - if (err && err != -ENOMSG) { + memset(&rs, 0, sizeof(struct resize)); + if (!resize_from_tags(mdev, nlp->tag_list, &rs)) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(from_attrs_err_to_txt(err)); goto fail; } - retcode = check_net_options(tconn, new_conf); - if (retcode != NO_ERROR) + if (mdev->state.conn > C_CONNECTED) { + retcode = ERR_RESIZE_RESYNC; goto fail; + } - retcode = alloc_crypto(&crypto, new_conf); - if (retcode != NO_ERROR) + if (mdev->state.role == R_SECONDARY && + mdev->state.peer == R_SECONDARY) { + retcode = ERR_NO_PRIMARY; goto fail; + } - ((char *)new_conf->shared_secret)[SHARED_SECRET_MAX-1] = 0; - - conn_flush_workqueue(tconn); - - mutex_lock(&tconn->conf_update); - old_conf = tconn->net_conf; - if (old_conf) { - retcode = ERR_NET_CONFIGURED; - mutex_unlock(&tconn->conf_update); + if (!get_ldev(mdev)) { + retcode = ERR_NO_DISK; goto fail; } - rcu_assign_pointer(tconn->net_conf, new_conf); - - conn_free_crypto(tconn); - tconn->cram_hmac_tfm = crypto.cram_hmac_tfm; - tconn->integrity_tfm = crypto.integrity_tfm; - tconn->csums_tfm = crypto.csums_tfm; - tconn->verify_tfm = crypto.verify_tfm; - - tconn->my_addr_len = nla_len(adm_ctx.my_addr); - memcpy(&tconn->my_addr, nla_data(adm_ctx.my_addr), tconn->my_addr_len); - tconn->peer_addr_len = nla_len(adm_ctx.peer_addr); - memcpy(&tconn->peer_addr, nla_data(adm_ctx.peer_addr), tconn->peer_addr_len); - mutex_unlock(&tconn->conf_update); - - rcu_read_lock(); - idr_for_each_entry(&tconn->volumes, mdev, i) { - mdev->send_cnt = 0; - mdev->recv_cnt = 0; + if (rs.no_resync && mdev->agreed_pro_version < 93) { + retcode = ERR_NEED_APV_93; + goto fail_ldev; } - rcu_read_unlock(); - - retcode = conn_request_state(tconn, NS(conn, C_UNCONNECTED), CS_VERBOSE); - conn_reconfig_done(tconn); - drbd_adm_finish(info, retcode); - return 0; + if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) + mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev); -fail: - free_crypto(&crypto); - kfree(new_conf); + mdev->ldev->dc.disk_size = (sector_t)rs.resize_size; + ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? 
DDSF_NO_RESYNC : 0); + dd = drbd_determine_dev_size(mdev, ddsf); + drbd_md_sync(mdev); + put_ldev(mdev); + if (dd == dev_size_error) { + retcode = ERR_NOMEM_BITMAP; + goto fail; + } - conn_reconfig_done(tconn); -out: - drbd_adm_finish(info, retcode); + if (mdev->state.conn == C_CONNECTED) { + if (dd == grew) + set_bit(RESIZE_PENDING, &mdev->flags); + + drbd_send_uuids(mdev); + drbd_send_sizes(mdev, 1, ddsf); + } + + fail: + reply->ret_code = retcode; return 0; + + fail_ldev: + put_ldev(mdev); + goto fail; } -static enum drbd_state_rv conn_try_disconnect(struct drbd_tconn *tconn, bool force) +static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) { - enum drbd_state_rv rv; + int retcode = NO_ERROR; + int err; + int ovr; /* online verify running */ + int rsr; /* re-sync running */ + struct crypto_hash *verify_tfm = NULL; + struct crypto_hash *csums_tfm = NULL; + struct syncer_conf sc; + cpumask_var_t new_cpu_mask; + int *rs_plan_s = NULL; + int fifo_size; + + if (!zalloc_cpumask_var(&new_cpu_mask, GFP_KERNEL)) { + retcode = ERR_NOMEM; + goto fail; + } - rv = conn_request_state(tconn, NS(conn, C_DISCONNECTING), - force ? CS_HARD : 0); + if (nlp->flags & DRBD_NL_SET_DEFAULTS) { + memset(&sc, 0, sizeof(struct syncer_conf)); + sc.rate = DRBD_RATE_DEF; + sc.after = DRBD_AFTER_DEF; + sc.al_extents = DRBD_AL_EXTENTS_DEF; + sc.on_no_data = DRBD_ON_NO_DATA_DEF; + sc.c_plan_ahead = DRBD_C_PLAN_AHEAD_DEF; + sc.c_delay_target = DRBD_C_DELAY_TARGET_DEF; + sc.c_fill_target = DRBD_C_FILL_TARGET_DEF; + sc.c_max_rate = DRBD_C_MAX_RATE_DEF; + sc.c_min_rate = DRBD_C_MIN_RATE_DEF; + } else + memcpy(&sc, &mdev->sync_conf, sizeof(struct syncer_conf)); - switch (rv) { - case SS_NOTHING_TO_DO: - break; - case SS_ALREADY_STANDALONE: - return SS_SUCCESS; - case SS_PRIMARY_NOP: - /* Our state checking code wants to see the peer outdated. */ - rv = conn_request_state(tconn, NS2(conn, C_DISCONNECTING, - pdsk, D_OUTDATED), CS_VERBOSE); - break; - case SS_CW_FAILED_BY_PEER: - /* The peer probably wants to see us outdated. */ - rv = conn_request_state(tconn, NS2(conn, C_DISCONNECTING, - disk, D_OUTDATED), 0); - if (rv == SS_IS_DISKLESS || rv == SS_LOWER_THAN_OUTDATED) { - rv = conn_request_state(tconn, NS(conn, C_DISCONNECTING), - CS_HARD); - } - break; - default:; - /* no special handling necessary */ - } - - if (rv >= SS_SUCCESS) { - enum drbd_state_rv rv2; - /* No one else can reconfigure the network while I am here. - * The state handling only uses drbd_thread_stop_nowait(), - * we want to really wait here until the receiver is no more. - */ - drbd_thread_stop(&adm_ctx.tconn->receiver); - - /* Race breaker. This additional state change request may be - * necessary, if this was a forced disconnect during a receiver - * restart. We may have "killed" the receiver thread just - * after drbdd_init() returned. Typically, we should be - * C_STANDALONE already, now, and this becomes a no-op. 
- */ - rv2 = conn_request_state(tconn, NS(conn, C_STANDALONE), - CS_VERBOSE | CS_HARD); - if (rv2 < SS_SUCCESS) - conn_err(tconn, - "unexpected rv2=%d in conn_try_disconnect()\n", - rv2); + if (!syncer_conf_from_tags(mdev, nlp->tag_list, &sc)) { + retcode = ERR_MANDATORY_TAG; + goto fail; } - return rv; -} -int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info) -{ - struct disconnect_parms parms; - struct drbd_tconn *tconn; - enum drbd_state_rv rv; - enum drbd_ret_code retcode; - int err; + /* re-sync running */ + rsr = ( mdev->state.conn == C_SYNC_SOURCE || + mdev->state.conn == C_SYNC_TARGET || + mdev->state.conn == C_PAUSED_SYNC_S || + mdev->state.conn == C_PAUSED_SYNC_T ); - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_CONNECTION); - if (!adm_ctx.reply_skb) - return retcode; - if (retcode != NO_ERROR) + if (rsr && strcmp(sc.csums_alg, mdev->sync_conf.csums_alg)) { + retcode = ERR_CSUMS_RESYNC_RUNNING; goto fail; + } - tconn = adm_ctx.tconn; - memset(&parms, 0, sizeof(parms)); - if (info->attrs[DRBD_NLA_DISCONNECT_PARMS]) { - err = disconnect_parms_from_attrs(&parms, info); - if (err) { - retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(from_attrs_err_to_txt(err)); + if (!rsr && sc.csums_alg[0]) { + csums_tfm = crypto_alloc_hash(sc.csums_alg, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(csums_tfm)) { + csums_tfm = NULL; + retcode = ERR_CSUMS_ALG; goto fail; } - } - rv = conn_try_disconnect(tconn, parms.force_disconnect); - if (rv < SS_SUCCESS) - retcode = rv; /* FIXME: Type mismatch. */ - else - retcode = NO_ERROR; - fail: - drbd_adm_finish(info, retcode); - return 0; -} - -void resync_after_online_grow(struct drbd_conf *mdev) -{ - int iass; /* I am sync source */ + if (!drbd_crypto_is_hash(crypto_hash_tfm(csums_tfm))) { + retcode = ERR_CSUMS_ALG_ND; + goto fail; + } + } - dev_info(DEV, "Resync of new storage after online grow\n"); - if (mdev->state.role != mdev->state.peer) - iass = (mdev->state.role == R_PRIMARY); - else - iass = test_bit(RESOLVE_CONFLICTS, &mdev->tconn->flags); + /* online verify running */ + ovr = (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T); - if (iass) - drbd_start_resync(mdev, C_SYNC_SOURCE); - else - _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE + CS_SERIALIZE); -} + if (ovr) { + if (strcmp(sc.verify_alg, mdev->sync_conf.verify_alg)) { + retcode = ERR_VERIFY_RUNNING; + goto fail; + } + } -int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) -{ - struct disk_conf *old_disk_conf, *new_disk_conf = NULL; - struct resize_parms rs; - struct drbd_conf *mdev; - enum drbd_ret_code retcode; - enum determine_dev_size dd; - enum dds_flags ddsf; - sector_t u_size; - int err; + if (!ovr && sc.verify_alg[0]) { + verify_tfm = crypto_alloc_hash(sc.verify_alg, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(verify_tfm)) { + verify_tfm = NULL; + retcode = ERR_VERIFY_ALG; + goto fail; + } - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); - if (!adm_ctx.reply_skb) - return retcode; - if (retcode != NO_ERROR) - goto fail; + if (!drbd_crypto_is_hash(crypto_hash_tfm(verify_tfm))) { + retcode = ERR_VERIFY_ALG_ND; + goto fail; + } + } - memset(&rs, 0, sizeof(struct resize_parms)); - if (info->attrs[DRBD_NLA_RESIZE_PARMS]) { - err = resize_parms_from_attrs(&rs, info); + /* silently ignore cpu mask on UP kernel */ + if (nr_cpu_ids > 1 && sc.cpu_mask[0] != 0) { + err = bitmap_parse(sc.cpu_mask, 32, + cpumask_bits(new_cpu_mask), nr_cpu_ids); if (err) { - retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(from_attrs_err_to_txt(err)); 
+ dev_warn(DEV, "bitmap_parse() failed with %d\n", err); + retcode = ERR_CPU_MASK_PARSE; goto fail; } } - mdev = adm_ctx.mdev; - if (mdev->state.conn > C_CONNECTED) { - retcode = ERR_RESIZE_RESYNC; - goto fail; + ERR_IF (sc.rate < 1) sc.rate = 1; + ERR_IF (sc.al_extents < 7) sc.al_extents = 127; /* arbitrary minimum */ +#define AL_MAX ((MD_AL_MAX_SIZE-1) * AL_EXTENTS_PT) + if (sc.al_extents > AL_MAX) { + dev_err(DEV, "sc.al_extents > %d\n", AL_MAX); + sc.al_extents = AL_MAX; } +#undef AL_MAX - if (mdev->state.role == R_SECONDARY && - mdev->state.peer == R_SECONDARY) { - retcode = ERR_NO_PRIMARY; - goto fail; - } + /* to avoid spurious errors when configuring minors before configuring + * the minors they depend on: if necessary, first create the minor we + * depend on */ + if (sc.after >= 0) + ensure_mdev(sc.after, 1); - if (!get_ldev(mdev)) { - retcode = ERR_NO_DISK; + /* most sanity checks done, try to assign the new sync-after + * dependency. need to hold the global lock in there, + * to avoid a race in the dependency loop check. */ + retcode = drbd_alter_sa(mdev, sc.after); + if (retcode != NO_ERROR) goto fail; - } - if (rs.no_resync && mdev->tconn->agreed_pro_version < 93) { - retcode = ERR_NEED_APV_93; - goto fail_ldev; - } - - rcu_read_lock(); - u_size = rcu_dereference(mdev->ldev->disk_conf)->disk_size; - rcu_read_unlock(); - if (u_size != (sector_t)rs.resize_size) { - new_disk_conf = kmalloc(sizeof(struct disk_conf), GFP_KERNEL); - if (!new_disk_conf) { + fifo_size = (sc.c_plan_ahead * 10 * SLEEP_TIME) / HZ; + if (fifo_size != mdev->rs_plan_s.size && fifo_size > 0) { + rs_plan_s = kzalloc(sizeof(int) * fifo_size, GFP_KERNEL); + if (!rs_plan_s) { + dev_err(DEV, "kmalloc of fifo_buffer failed"); retcode = ERR_NOMEM; - goto fail_ldev; + goto fail; } } - if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) - mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev); + /* ok, assign the rest of it as well. + * lock against receive_SyncParam() */ + spin_lock(&mdev->peer_seq_lock); + mdev->sync_conf = sc; - if (new_disk_conf) { - mutex_lock(&mdev->tconn->conf_update); - old_disk_conf = mdev->ldev->disk_conf; - *new_disk_conf = *old_disk_conf; - new_disk_conf->disk_size = (sector_t)rs.resize_size; - rcu_assign_pointer(mdev->ldev->disk_conf, new_disk_conf); - mutex_unlock(&mdev->tconn->conf_update); - synchronize_rcu(); - kfree(old_disk_conf); + if (!rsr) { + crypto_free_hash(mdev->csums_tfm); + mdev->csums_tfm = csums_tfm; + csums_tfm = NULL; } - ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? 
DDSF_NO_RESYNC : 0); - dd = drbd_determine_dev_size(mdev, ddsf); - drbd_md_sync(mdev); - put_ldev(mdev); - if (dd == dev_size_error) { - retcode = ERR_NOMEM_BITMAP; - goto fail; + if (!ovr) { + crypto_free_hash(mdev->verify_tfm); + mdev->verify_tfm = verify_tfm; + verify_tfm = NULL; } - if (mdev->state.conn == C_CONNECTED) { - if (dd == grew) - set_bit(RESIZE_PENDING, &mdev->flags); - - drbd_send_uuids(mdev); - drbd_send_sizes(mdev, 1, ddsf); + if (fifo_size != mdev->rs_plan_s.size) { + kfree(mdev->rs_plan_s.values); + mdev->rs_plan_s.values = rs_plan_s; + mdev->rs_plan_s.size = fifo_size; + mdev->rs_planed = 0; + rs_plan_s = NULL; } - fail: - drbd_adm_finish(info, retcode); - return 0; + spin_unlock(&mdev->peer_seq_lock); - fail_ldev: - put_ldev(mdev); - goto fail; -} + if (get_ldev(mdev)) { + wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); + drbd_al_shrink(mdev); + err = drbd_check_al_size(mdev); + lc_unlock(mdev->act_log); + wake_up(&mdev->al_wait); -int drbd_adm_resource_opts(struct sk_buff *skb, struct genl_info *info) -{ - enum drbd_ret_code retcode; - struct drbd_tconn *tconn; - struct res_opts res_opts; - int err; - - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE); - if (!adm_ctx.reply_skb) - return retcode; - if (retcode != NO_ERROR) - goto fail; - tconn = adm_ctx.tconn; - - res_opts = tconn->res_opts; - if (should_set_defaults(info)) - set_res_opts_defaults(&res_opts); + put_ldev(mdev); + drbd_md_sync(mdev); - err = res_opts_from_attrs(&res_opts, info); - if (err && err != -ENOMSG) { - retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(from_attrs_err_to_txt(err)); - goto fail; + if (err) { + retcode = ERR_NOMEM; + goto fail; + } } - err = set_resource_options(tconn, &res_opts); - if (err) { - retcode = ERR_INVALID_REQUEST; - if (err == -ENOMEM) - retcode = ERR_NOMEM; + if (mdev->state.conn >= C_CONNECTED) + drbd_send_sync_param(mdev, &sc); + + if (!cpumask_equal(mdev->cpu_mask, new_cpu_mask)) { + cpumask_copy(mdev->cpu_mask, new_cpu_mask); + drbd_calc_cpu_mask(mdev); + mdev->receiver.reset_cpu_mask = 1; + mdev->asender.reset_cpu_mask = 1; + mdev->worker.reset_cpu_mask = 1; } + kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE); fail: - drbd_adm_finish(info, retcode); + kfree(rs_plan_s); + free_cpumask_var(new_cpu_mask); + crypto_free_hash(csums_tfm); + crypto_free_hash(verify_tfm); + reply->ret_code = retcode; return 0; } -int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info) +static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) { - struct drbd_conf *mdev; - int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */ - - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); - if (!adm_ctx.reply_skb) - return retcode; - if (retcode != NO_ERROR) - goto out; - - mdev = adm_ctx.mdev; + int retcode; /* If there is still bitmap IO pending, probably because of a previous * resync just being finished, wait for it before requesting a new resync. 
@@ -2412,10 +1990,10 @@ int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info) retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T)); while (retcode == SS_NEED_CONNECTION) { - spin_lock_irq(&mdev->tconn->req_lock); + spin_lock_irq(&mdev->req_lock); if (mdev->state.conn < C_CONNECTED) retcode = _drbd_set_state(_NS(mdev, disk, D_INCONSISTENT), CS_VERBOSE, NULL); - spin_unlock_irq(&mdev->tconn->req_lock); + spin_unlock_irq(&mdev->req_lock); if (retcode != SS_NEED_CONNECTION) break; @@ -2424,25 +2002,7 @@ int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info) } drbd_resume_io(mdev); -out: - drbd_adm_finish(info, retcode); - return 0; -} - -static int drbd_adm_simple_request_state(struct sk_buff *skb, struct genl_info *info, - union drbd_state mask, union drbd_state val) -{ - enum drbd_ret_code retcode; - - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); - if (!adm_ctx.reply_skb) - return retcode; - if (retcode != NO_ERROR) - goto out; - - retcode = drbd_request_state(adm_ctx.mdev, mask, val); -out: - drbd_adm_finish(info, retcode); + reply->ret_code = retcode; return 0; } @@ -2455,18 +2015,10 @@ static int drbd_bmio_set_susp_al(struct drbd_conf *mdev) return rv; } -int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info) +static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) { - int retcode; /* drbd_ret_code, drbd_state_rv */ - struct drbd_conf *mdev; - - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); - if (!adm_ctx.reply_skb) - return retcode; - if (retcode != NO_ERROR) - goto out; - - mdev = adm_ctx.mdev; + int retcode; /* If there is still bitmap IO pending, probably because of a previous * resync just being finished, wait for it before requesting a new resync. @@ -2476,15 +2028,16 @@ int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info) drbd_flush_workqueue(mdev); retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S), CS_ORDERED); + if (retcode < SS_SUCCESS) { if (retcode == SS_NEED_CONNECTION && mdev->state.role == R_PRIMARY) { - /* The peer will get a resync upon connect anyways. - * Just make that into a full resync. */ + /* The peer will get a resync upon connect anyways. Just make that + into a full resync. 
*/ retcode = drbd_request_state(mdev, NS(pdsk, D_INCONSISTENT)); if (retcode >= SS_SUCCESS) { if (drbd_bitmap_io(mdev, &drbd_bmio_set_susp_al, - "set_n_write from invalidate_peer", - BM_LOCKED_SET_ALLOWED)) + "set_n_write from invalidate_peer", + BM_LOCKED_SET_ALLOWED)) retcode = ERR_IO_MD_DISK; } } else @@ -2492,41 +2045,30 @@ int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info) } drbd_resume_io(mdev); -out: - drbd_adm_finish(info, retcode); + reply->ret_code = retcode; return 0; } -int drbd_adm_pause_sync(struct sk_buff *skb, struct genl_info *info) +static int drbd_nl_pause_sync(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) { - enum drbd_ret_code retcode; - - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); - if (!adm_ctx.reply_skb) - return retcode; - if (retcode != NO_ERROR) - goto out; + int retcode = NO_ERROR; - if (drbd_request_state(adm_ctx.mdev, NS(user_isp, 1)) == SS_NOTHING_TO_DO) + if (drbd_request_state(mdev, NS(user_isp, 1)) == SS_NOTHING_TO_DO) retcode = ERR_PAUSE_IS_SET; -out: - drbd_adm_finish(info, retcode); + + reply->ret_code = retcode; return 0; } -int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info) +static int drbd_nl_resume_sync(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) { - union drbd_dev_state s; - enum drbd_ret_code retcode; + int retcode = NO_ERROR; + union drbd_state s; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); - if (!adm_ctx.reply_skb) - return retcode; - if (retcode != NO_ERROR) - goto out; - - if (drbd_request_state(adm_ctx.mdev, NS(user_isp, 0)) == SS_NOTHING_TO_DO) { - s = adm_ctx.mdev->state; + if (drbd_request_state(mdev, NS(user_isp, 0)) == SS_NOTHING_TO_DO) { + s = mdev->state; if (s.conn == C_PAUSED_SYNC_S || s.conn == C_PAUSED_SYNC_T) { retcode = s.aftr_isp ? ERR_PIC_AFTER_DEP : s.peer_isp ? ERR_PIC_PEER_DEP : ERR_PAUSE_IS_CLEAR; @@ -2535,482 +2077,172 @@ int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info) } } -out: - drbd_adm_finish(info, retcode); + reply->ret_code = retcode; return 0; } -int drbd_adm_suspend_io(struct sk_buff *skb, struct genl_info *info) +static int drbd_nl_suspend_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) { - return drbd_adm_simple_request_state(skb, info, NS(susp, 1)); + reply->ret_code = drbd_request_state(mdev, NS(susp, 1)); + + return 0; } -int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info) +static int drbd_nl_resume_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) { - struct drbd_conf *mdev; - int retcode; /* enum drbd_ret_code rsp. 
enum drbd_state_rv */ - - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); - if (!adm_ctx.reply_skb) - return retcode; - if (retcode != NO_ERROR) - goto out; - - mdev = adm_ctx.mdev; if (test_bit(NEW_CUR_UUID, &mdev->flags)) { drbd_uuid_new_current(mdev); clear_bit(NEW_CUR_UUID, &mdev->flags); } drbd_suspend_io(mdev); - retcode = drbd_request_state(mdev, NS3(susp, 0, susp_nod, 0, susp_fen, 0)); - if (retcode == SS_SUCCESS) { + reply->ret_code = drbd_request_state(mdev, NS3(susp, 0, susp_nod, 0, susp_fen, 0)); + if (reply->ret_code == SS_SUCCESS) { if (mdev->state.conn < C_CONNECTED) - tl_clear(mdev->tconn); + tl_clear(mdev); if (mdev->state.disk == D_DISKLESS || mdev->state.disk == D_FAILED) - tl_restart(mdev->tconn, FAIL_FROZEN_DISK_IO); + tl_restart(mdev, fail_frozen_disk_io); } drbd_resume_io(mdev); -out: - drbd_adm_finish(info, retcode); return 0; } -int drbd_adm_outdate(struct sk_buff *skb, struct genl_info *info) -{ - return drbd_adm_simple_request_state(skb, info, NS(disk, D_OUTDATED)); -} - -int nla_put_drbd_cfg_context(struct sk_buff *skb, struct drbd_tconn *tconn, unsigned vnr) +static int drbd_nl_outdate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) { - struct nlattr *nla; - nla = nla_nest_start(skb, DRBD_NLA_CFG_CONTEXT); - if (!nla) - goto nla_put_failure; - if (vnr != VOLUME_UNSPECIFIED && - nla_put_u32(skb, T_ctx_volume, vnr)) - goto nla_put_failure; - if (nla_put_string(skb, T_ctx_resource_name, tconn->name)) - goto nla_put_failure; - if (tconn->my_addr_len && - nla_put(skb, T_ctx_my_addr, tconn->my_addr_len, &tconn->my_addr)) - goto nla_put_failure; - if (tconn->peer_addr_len && - nla_put(skb, T_ctx_peer_addr, tconn->peer_addr_len, &tconn->peer_addr)) - goto nla_put_failure; - nla_nest_end(skb, nla); + reply->ret_code = drbd_request_state(mdev, NS(disk, D_OUTDATED)); return 0; - -nla_put_failure: - if (nla) - nla_nest_cancel(skb, nla); - return -EMSGSIZE; } -int nla_put_status_info(struct sk_buff *skb, struct drbd_conf *mdev, - const struct sib_info *sib) +static int drbd_nl_get_config(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) { - struct state_info *si = NULL; /* for sizeof(si->member); */ - struct net_conf *nc; - struct nlattr *nla; - int got_ldev; - int err = 0; - int exclude_sensitive; - - /* If sib != NULL, this is drbd_bcast_event, which anyone can listen - * to. So we better exclude_sensitive information. - * - * If sib == NULL, this is drbd_adm_get_status, executed synchronously - * in the context of the requesting user process. Exclude sensitive - * information, unless current has superuser. - * - * NOTE: for drbd_adm_get_status_all(), this is a netlink dump, and - * relies on the current implementation of netlink_dump(), which - * executes the dump callback successively from netlink_recvmsg(), - * always in the context of the receiving process */ - exclude_sensitive = sib || !capable(CAP_SYS_ADMIN); - - got_ldev = get_ldev(mdev); - - /* We need to add connection name and volume number information still. - * Minor number is in drbd_genlmsghdr. 
*/ - if (nla_put_drbd_cfg_context(skb, mdev->tconn, mdev->vnr)) - goto nla_put_failure; - - if (res_opts_to_skb(skb, &mdev->tconn->res_opts, exclude_sensitive)) - goto nla_put_failure; - - rcu_read_lock(); - if (got_ldev) - if (disk_conf_to_skb(skb, rcu_dereference(mdev->ldev->disk_conf), exclude_sensitive)) - goto nla_put_failure; - - nc = rcu_dereference(mdev->tconn->net_conf); - if (nc) - err = net_conf_to_skb(skb, nc, exclude_sensitive); - rcu_read_unlock(); - if (err) - goto nla_put_failure; - - nla = nla_nest_start(skb, DRBD_NLA_STATE_INFO); - if (!nla) - goto nla_put_failure; - if (nla_put_u32(skb, T_sib_reason, sib ? sib->sib_reason : SIB_GET_STATUS_REPLY) || - nla_put_u32(skb, T_current_state, mdev->state.i) || - nla_put_u64(skb, T_ed_uuid, mdev->ed_uuid) || - nla_put_u64(skb, T_capacity, drbd_get_capacity(mdev->this_bdev)) || - nla_put_u64(skb, T_send_cnt, mdev->send_cnt) || - nla_put_u64(skb, T_recv_cnt, mdev->recv_cnt) || - nla_put_u64(skb, T_read_cnt, mdev->read_cnt) || - nla_put_u64(skb, T_writ_cnt, mdev->writ_cnt) || - nla_put_u64(skb, T_al_writ_cnt, mdev->al_writ_cnt) || - nla_put_u64(skb, T_bm_writ_cnt, mdev->bm_writ_cnt) || - nla_put_u32(skb, T_ap_bio_cnt, atomic_read(&mdev->ap_bio_cnt)) || - nla_put_u32(skb, T_ap_pending_cnt, atomic_read(&mdev->ap_pending_cnt)) || - nla_put_u32(skb, T_rs_pending_cnt, atomic_read(&mdev->rs_pending_cnt))) - goto nla_put_failure; - - if (got_ldev) { - int err; - - spin_lock_irq(&mdev->ldev->md.uuid_lock); - err = nla_put(skb, T_uuids, sizeof(si->uuids), mdev->ldev->md.uuid); - spin_unlock_irq(&mdev->ldev->md.uuid_lock); - - if (err) - goto nla_put_failure; - - if (nla_put_u32(skb, T_disk_flags, mdev->ldev->md.flags) || - nla_put_u64(skb, T_bits_total, drbd_bm_bits(mdev)) || - nla_put_u64(skb, T_bits_oos, drbd_bm_total_weight(mdev))) - goto nla_put_failure; - if (C_SYNC_SOURCE <= mdev->state.conn && - C_PAUSED_SYNC_T >= mdev->state.conn) { - if (nla_put_u64(skb, T_bits_rs_total, mdev->rs_total) || - nla_put_u64(skb, T_bits_rs_failed, mdev->rs_failed)) - goto nla_put_failure; - } - } + unsigned short *tl; - if (sib) { - switch(sib->sib_reason) { - case SIB_SYNC_PROGRESS: - case SIB_GET_STATUS_REPLY: - break; - case SIB_STATE_CHANGE: - if (nla_put_u32(skb, T_prev_state, sib->os.i) || - nla_put_u32(skb, T_new_state, sib->ns.i)) - goto nla_put_failure; - break; - case SIB_HELPER_POST: - if (nla_put_u32(skb, T_helper_exit_code, - sib->helper_exit_code)) - goto nla_put_failure; - /* fall through */ - case SIB_HELPER_PRE: - if (nla_put_string(skb, T_helper, sib->helper_name)) - goto nla_put_failure; - break; - } - } - nla_nest_end(skb, nla); + tl = reply->tag_list; - if (0) -nla_put_failure: - err = -EMSGSIZE; - if (got_ldev) + if (get_ldev(mdev)) { + tl = disk_conf_to_tags(mdev, &mdev->ldev->dc, tl); put_ldev(mdev); - return err; -} + } -int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info) -{ - enum drbd_ret_code retcode; - int err; + if (get_net_conf(mdev)) { + tl = net_conf_to_tags(mdev, mdev->net_conf, tl); + put_net_conf(mdev); + } + tl = syncer_conf_to_tags(mdev, &mdev->sync_conf, tl); - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); - if (!adm_ctx.reply_skb) - return retcode; - if (retcode != NO_ERROR) - goto out; + put_unaligned(TT_END, tl++); /* Close the tag list */ - err = nla_put_status_info(adm_ctx.reply_skb, adm_ctx.mdev, NULL); - if (err) { - nlmsg_free(adm_ctx.reply_skb); - return err; - } -out: - drbd_adm_finish(info, retcode); - return 0; + return (int)((char *)tl - (char *)reply->tag_list); } -int 
get_one_status(struct sk_buff *skb, struct netlink_callback *cb) +static int drbd_nl_get_state(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) { - struct drbd_conf *mdev; - struct drbd_genlmsghdr *dh; - struct drbd_tconn *pos = (struct drbd_tconn*)cb->args[0]; - struct drbd_tconn *tconn = NULL; - struct drbd_tconn *tmp; - unsigned volume = cb->args[1]; - - /* Open coded, deferred, iteration: - * list_for_each_entry_safe(tconn, tmp, &drbd_tconns, all_tconn) { - * idr_for_each_entry(&tconn->volumes, mdev, i) { - * ... - * } - * } - * where tconn is cb->args[0]; - * and i is cb->args[1]; - * - * cb->args[2] indicates if we shall loop over all resources, - * or just dump all volumes of a single resource. - * - * This may miss entries inserted after this dump started, - * or entries deleted before they are reached. - * - * We need to make sure the mdev won't disappear while - * we are looking at it, and revalidate our iterators - * on each iteration. - */ + unsigned short *tl = reply->tag_list; + union drbd_state s = mdev->state; + unsigned long rs_left; + unsigned int res; - /* synchronize with conn_create()/conn_destroy() */ - rcu_read_lock(); - /* revalidate iterator position */ - list_for_each_entry_rcu(tmp, &drbd_tconns, all_tconn) { - if (pos == NULL) { - /* first iteration */ - pos = tmp; - tconn = pos; - break; - } - if (tmp == pos) { - tconn = pos; - break; + tl = get_state_to_tags(mdev, (struct get_state *)&s, tl); + + /* no local ref, no bitmap, no syncer progress. */ + if (s.conn >= C_SYNC_SOURCE && s.conn <= C_PAUSED_SYNC_T) { + if (get_ldev(mdev)) { + drbd_get_syncer_progress(mdev, &rs_left, &res); + tl = tl_add_int(tl, T_sync_progress, &res); + put_ldev(mdev); } } - if (tconn) { -next_tconn: - mdev = idr_get_next(&tconn->volumes, &volume); - if (!mdev) { - /* No more volumes to dump on this tconn. - * Advance tconn iterator. */ - pos = list_entry_rcu(tconn->all_tconn.next, - struct drbd_tconn, all_tconn); - /* Did we dump any volume on this tconn yet? */ - if (volume != 0) { - /* If we reached the end of the list, - * or only a single resource dump was requested, - * we are done. */ - if (&pos->all_tconn == &drbd_tconns || cb->args[2]) - goto out; - volume = 0; - tconn = pos; - goto next_tconn; - } - } - - dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, &drbd_genl_family, - NLM_F_MULTI, DRBD_ADM_GET_STATUS); - if (!dh) - goto out; + put_unaligned(TT_END, tl++); /* Close the tag list */ - if (!mdev) { - /* This is a tconn without a single volume. - * Suprisingly enough, it may have a network - * configuration. 
*/ - struct net_conf *nc; - dh->minor = -1U; - dh->ret_code = NO_ERROR; - if (nla_put_drbd_cfg_context(skb, tconn, VOLUME_UNSPECIFIED)) - goto cancel; - nc = rcu_dereference(tconn->net_conf); - if (nc && net_conf_to_skb(skb, nc, 1) != 0) - goto cancel; - goto done; - } + return (int)((char *)tl - (char *)reply->tag_list); +} - D_ASSERT(mdev->vnr == volume); - D_ASSERT(mdev->tconn == tconn); +static int drbd_nl_get_uuids(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + unsigned short *tl; - dh->minor = mdev_to_minor(mdev); - dh->ret_code = NO_ERROR; + tl = reply->tag_list; - if (nla_put_status_info(skb, mdev, NULL)) { -cancel: - genlmsg_cancel(skb, dh); - goto out; - } -done: - genlmsg_end(skb, dh); - } + if (get_ldev(mdev)) { + tl = tl_add_blob(tl, T_uuids, mdev->ldev->md.uuid, UI_SIZE*sizeof(u64)); + tl = tl_add_int(tl, T_uuids_flags, &mdev->ldev->md.flags); + put_ldev(mdev); + } + put_unaligned(TT_END, tl++); /* Close the tag list */ -out: - rcu_read_unlock(); - /* where to start the next iteration */ - cb->args[0] = (long)pos; - cb->args[1] = (pos == tconn) ? volume + 1 : 0; - - /* No more tconns/volumes/minors found results in an empty skb. - * Which will terminate the dump. */ - return skb->len; + return (int)((char *)tl - (char *)reply->tag_list); } -/* - * Request status of all resources, or of all volumes within a single resource. - * - * This is a dump, as the answer may not fit in a single reply skb otherwise. - * Which means we cannot use the family->attrbuf or other such members, because - * dump is NOT protected by the genl_lock(). During dump, we only have access - * to the incoming skb, and need to opencode "parsing" of the nlattr payload. - * - * Once things are setup properly, we call into get_one_status(). +/** + * drbd_nl_get_timeout_flag() - Used by drbdsetup to find out which timeout value to use + * @mdev: DRBD device. + * @nlp: Netlink/connector packet from drbdsetup + * @reply: Reply packet for drbdsetup */ -int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb) +static int drbd_nl_get_timeout_flag(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) { - const unsigned hdrlen = GENL_HDRLEN + GENL_MAGIC_FAMILY_HDRSZ; - struct nlattr *nla; - const char *resource_name; - struct drbd_tconn *tconn; - int maxtype; - - /* Is this a followup call? */ - if (cb->args[0]) { - /* ... of a single resource dump, - * and the resource iterator has been advanced already? */ - if (cb->args[2] && cb->args[2] != cb->args[0]) - return 0; /* DONE. */ - goto dump; - } - - /* First call (from netlink_dump_start). We need to figure out - * which resource(s) the user wants us to dump. */ - nla = nla_find(nlmsg_attrdata(cb->nlh, hdrlen), - nlmsg_attrlen(cb->nlh, hdrlen), - DRBD_NLA_CFG_CONTEXT); - - /* No explicit context given. Dump all. */ - if (!nla) - goto dump; - maxtype = ARRAY_SIZE(drbd_cfg_context_nl_policy) - 1; - nla = drbd_nla_find_nested(maxtype, nla, __nla_type(T_ctx_resource_name)); - if (IS_ERR(nla)) - return PTR_ERR(nla); - /* context given, but no name present? */ - if (!nla) - return -EINVAL; - resource_name = nla_data(nla); - tconn = conn_get_by_name(resource_name); - - if (!tconn) - return -ENODEV; - - kref_put(&tconn->kref, &conn_destroy); /* get_one_status() (re)validates tconn by itself */ - - /* prime iterators, and set "filter" mode mark: - * only dump this tconn. */ - cb->args[0] = (long)tconn; - /* cb->args[1] = 0; passed in this way. 
*/ - cb->args[2] = (long)tconn; - -dump: - return get_one_status(skb, cb); -} + unsigned short *tl; + char rv; -int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info) -{ - enum drbd_ret_code retcode; - struct timeout_parms tp; - int err; + tl = reply->tag_list; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); - if (!adm_ctx.reply_skb) - return retcode; - if (retcode != NO_ERROR) - goto out; + rv = mdev->state.pdsk == D_OUTDATED ? UT_PEER_OUTDATED : + test_bit(USE_DEGR_WFC_T, &mdev->flags) ? UT_DEGRADED : UT_DEFAULT; - tp.timeout_type = - adm_ctx.mdev->state.pdsk == D_OUTDATED ? UT_PEER_OUTDATED : - test_bit(USE_DEGR_WFC_T, &adm_ctx.mdev->flags) ? UT_DEGRADED : - UT_DEFAULT; + tl = tl_add_blob(tl, T_use_degraded, &rv, sizeof(rv)); + put_unaligned(TT_END, tl++); /* Close the tag list */ - err = timeout_parms_to_priv_skb(adm_ctx.reply_skb, &tp); - if (err) { - nlmsg_free(adm_ctx.reply_skb); - return err; - } -out: - drbd_adm_finish(info, retcode); - return 0; + return (int)((char *)tl - (char *)reply->tag_list); } -int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info) +static int drbd_nl_start_ov(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) { - struct drbd_conf *mdev; - enum drbd_ret_code retcode; - struct start_ov_parms parms; - - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); - if (!adm_ctx.reply_skb) - return retcode; - if (retcode != NO_ERROR) - goto out; - - mdev = adm_ctx.mdev; + /* default to resume from last known position, if possible */ + struct start_ov args = + { .start_sector = mdev->ov_start_sector }; - /* resume from last known position, if possible */ - parms.ov_start_sector = mdev->ov_start_sector; - parms.ov_stop_sector = ULLONG_MAX; - if (info->attrs[DRBD_NLA_START_OV_PARMS]) { - int err = start_ov_parms_from_attrs(&parms, info); - if (err) { - retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(from_attrs_err_to_txt(err)); - goto out; - } + if (!start_ov_from_tags(mdev, nlp->tag_list, &args)) { + reply->ret_code = ERR_MANDATORY_TAG; + return 0; } - /* w_make_ov_request expects position to be aligned */ - mdev->ov_start_sector = parms.ov_start_sector & ~(BM_SECT_PER_BIT-1); - mdev->ov_stop_sector = parms.ov_stop_sector; /* If there is still bitmap IO pending, e.g. previous resync or verify * just being finished, wait for it before requesting a new resync. 
*/ drbd_suspend_io(mdev); wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); - retcode = drbd_request_state(mdev,NS(conn,C_VERIFY_S)); + + /* w_make_ov_request expects position to be aligned */ + mdev->ov_start_sector = args.start_sector & ~BM_SECT_PER_BIT; + reply->ret_code = drbd_request_state(mdev,NS(conn,C_VERIFY_S)); drbd_resume_io(mdev); -out: - drbd_adm_finish(info, retcode); return 0; } -int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info) +static int drbd_nl_new_c_uuid(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) { - struct drbd_conf *mdev; - enum drbd_ret_code retcode; + int retcode = NO_ERROR; int skip_initial_sync = 0; int err; - struct new_c_uuid_parms args; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); - if (!adm_ctx.reply_skb) - return retcode; - if (retcode != NO_ERROR) - goto out_nolock; + struct new_c_uuid args; - mdev = adm_ctx.mdev; - memset(&args, 0, sizeof(args)); - if (info->attrs[DRBD_NLA_NEW_C_UUID_PARMS]) { - err = new_c_uuid_parms_from_attrs(&args, info); - if (err) { - retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(from_attrs_err_to_txt(err)); - goto out_nolock; - } + memset(&args, 0, sizeof(struct new_c_uuid)); + if (!new_c_uuid_from_tags(mdev, nlp->tag_list, &args)) { + reply->ret_code = ERR_MANDATORY_TAG; + return 0; } - mutex_lock(mdev->state_mutex); /* Protects us against serialized state changes. */ + mutex_lock(&mdev->state_mutex); /* Protects us against serialized state changes. */ if (!get_ldev(mdev)) { retcode = ERR_NO_DISK; @@ -3018,7 +2250,7 @@ int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info) } /* this is "skip initial sync", assume to be clean */ - if (mdev->state.conn == C_CONNECTED && mdev->tconn->agreed_pro_version >= 90 && + if (mdev->state.conn == C_CONNECTED && mdev->agreed_pro_version >= 90 && mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED && args.clear_bm) { dev_info(DEV, "Preparing to skip initial sync\n"); skip_initial_sync = 1; @@ -3041,10 +2273,10 @@ int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info) drbd_send_uuids_skip_initial_sync(mdev); _drbd_uuid_set(mdev, UI_BITMAP, 0); drbd_print_uuids(mdev, "cleared bitmap UUID"); - spin_lock_irq(&mdev->tconn->req_lock); + spin_lock_irq(&mdev->req_lock); _drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE), CS_VERBOSE, NULL); - spin_unlock_irq(&mdev->tconn->req_lock); + spin_unlock_irq(&mdev->req_lock); } } @@ -3052,284 +2284,416 @@ int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info) out_dec: put_ldev(mdev); out: - mutex_unlock(mdev->state_mutex); -out_nolock: - drbd_adm_finish(info, retcode); + mutex_unlock(&mdev->state_mutex); + + reply->ret_code = retcode; return 0; } -static enum drbd_ret_code -drbd_check_resource_name(const char *name) +struct cn_handler_struct { + int (*function)(struct drbd_conf *, + struct drbd_nl_cfg_req *, + struct drbd_nl_cfg_reply *); + int reply_body_size; +}; + +static struct cn_handler_struct cnd_table[] = { + [ P_primary ] = { &drbd_nl_primary, 0 }, + [ P_secondary ] = { &drbd_nl_secondary, 0 }, + [ P_disk_conf ] = { &drbd_nl_disk_conf, 0 }, + [ P_detach ] = { &drbd_nl_detach, 0 }, + [ P_net_conf ] = { &drbd_nl_net_conf, 0 }, + [ P_disconnect ] = { &drbd_nl_disconnect, 0 }, + [ P_resize ] = { &drbd_nl_resize, 0 }, + [ P_syncer_conf ] = { &drbd_nl_syncer_conf, 0 }, + [ P_invalidate ] = { &drbd_nl_invalidate, 0 }, + [ P_invalidate_peer ] = { &drbd_nl_invalidate_peer, 0 }, + [ P_pause_sync ] = { 
&drbd_nl_pause_sync, 0 }, + [ P_resume_sync ] = { &drbd_nl_resume_sync, 0 }, + [ P_suspend_io ] = { &drbd_nl_suspend_io, 0 }, + [ P_resume_io ] = { &drbd_nl_resume_io, 0 }, + [ P_outdate ] = { &drbd_nl_outdate, 0 }, + [ P_get_config ] = { &drbd_nl_get_config, + sizeof(struct syncer_conf_tag_len_struct) + + sizeof(struct disk_conf_tag_len_struct) + + sizeof(struct net_conf_tag_len_struct) }, + [ P_get_state ] = { &drbd_nl_get_state, + sizeof(struct get_state_tag_len_struct) + + sizeof(struct sync_progress_tag_len_struct) }, + [ P_get_uuids ] = { &drbd_nl_get_uuids, + sizeof(struct get_uuids_tag_len_struct) }, + [ P_get_timeout_flag ] = { &drbd_nl_get_timeout_flag, + sizeof(struct get_timeout_flag_tag_len_struct)}, + [ P_start_ov ] = { &drbd_nl_start_ov, 0 }, + [ P_new_c_uuid ] = { &drbd_nl_new_c_uuid, 0 }, +}; + +static void drbd_connector_callback(struct cn_msg *req, struct netlink_skb_parms *nsp) { - if (!name || !name[0]) { - drbd_msg_put_info("resource name missing"); - return ERR_MANDATORY_TAG; - } - /* if we want to use these in sysfs/configfs/debugfs some day, - * we must not allow slashes */ - if (strchr(name, '/')) { - drbd_msg_put_info("invalid resource name"); - return ERR_INVALID_REQUEST; + struct drbd_nl_cfg_req *nlp = (struct drbd_nl_cfg_req *)req->data; + struct cn_handler_struct *cm; + struct cn_msg *cn_reply; + struct drbd_nl_cfg_reply *reply; + struct drbd_conf *mdev; + int retcode, rr; + int reply_size = sizeof(struct cn_msg) + + sizeof(struct drbd_nl_cfg_reply) + + sizeof(short int); + + if (!try_module_get(THIS_MODULE)) { + printk(KERN_ERR "drbd: try_module_get() failed!\n"); + return; } - return NO_ERROR; -} -int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info) -{ - enum drbd_ret_code retcode; - struct res_opts res_opts; - int err; + if (!capable(CAP_SYS_ADMIN)) { + retcode = ERR_PERM; + goto fail; + } - retcode = drbd_adm_prepare(skb, info, 0); - if (!adm_ctx.reply_skb) - return retcode; - if (retcode != NO_ERROR) - goto out; + mdev = ensure_mdev(nlp->drbd_minor, + (nlp->flags & DRBD_NL_CREATE_DEVICE)); + if (!mdev) { + retcode = ERR_MINOR_INVALID; + goto fail; + } - set_res_opts_defaults(&res_opts); - err = res_opts_from_attrs(&res_opts, info); - if (err && err != -ENOMSG) { - retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(from_attrs_err_to_txt(err)); - goto out; + if (nlp->packet_type >= P_nl_after_last_packet || + nlp->packet_type == P_return_code_only) { + retcode = ERR_PACKET_NR; + goto fail; } - retcode = drbd_check_resource_name(adm_ctx.resource_name); - if (retcode != NO_ERROR) - goto out; + cm = cnd_table + nlp->packet_type; - if (adm_ctx.tconn) { - if (info->nlhdr->nlmsg_flags & NLM_F_EXCL) { - retcode = ERR_INVALID_REQUEST; - drbd_msg_put_info("resource exists"); - } - /* else: still NO_ERROR */ - goto out; + /* This may happen if packet number is 0: */ + if (cm->function == NULL) { + retcode = ERR_PACKET_NR; + goto fail; } - if (!conn_create(adm_ctx.resource_name, &res_opts)) + reply_size += cm->reply_body_size; + + /* allocation not in the IO path, cqueue thread context */ + cn_reply = kzalloc(reply_size, GFP_KERNEL); + if (!cn_reply) { retcode = ERR_NOMEM; -out: - drbd_adm_finish(info, retcode); - return 0; + goto fail; + } + reply = (struct drbd_nl_cfg_reply *) cn_reply->data; + + reply->packet_type = + cm->reply_body_size ? nlp->packet_type : P_return_code_only; + reply->minor = nlp->drbd_minor; + reply->ret_code = NO_ERROR; /* Might by modified by cm->function. */ + /* reply->tag_list; might be modified by cm->function. 
*/ + + rr = cm->function(mdev, nlp, reply); + + cn_reply->id = req->id; + cn_reply->seq = req->seq; + cn_reply->ack = req->ack + 1; + cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + rr; + cn_reply->flags = 0; + + rr = cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_KERNEL); + if (rr && rr != -ESRCH) + printk(KERN_INFO "drbd: cn_netlink_send()=%d\n", rr); + + kfree(cn_reply); + module_put(THIS_MODULE); + return; + fail: + drbd_nl_send_reply(req, retcode); + module_put(THIS_MODULE); } -int drbd_adm_add_minor(struct sk_buff *skb, struct genl_info *info) +static atomic_t drbd_nl_seq = ATOMIC_INIT(2); /* two. */ + +static unsigned short * +__tl_add_blob(unsigned short *tl, enum drbd_tags tag, const void *data, + unsigned short len, int nul_terminated) { - struct drbd_genlmsghdr *dh = info->userhdr; - enum drbd_ret_code retcode; + unsigned short l = tag_descriptions[tag_number(tag)].max_len; + len = (len < l) ? len : l; + put_unaligned(tag, tl++); + put_unaligned(len, tl++); + memcpy(tl, data, len); + tl = (unsigned short*)((char*)tl + len); + if (nul_terminated) + *((char*)tl - 1) = 0; + return tl; +} - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE); - if (!adm_ctx.reply_skb) - return retcode; - if (retcode != NO_ERROR) - goto out; +static unsigned short * +tl_add_blob(unsigned short *tl, enum drbd_tags tag, const void *data, int len) +{ + return __tl_add_blob(tl, tag, data, len, 0); +} - if (dh->minor > MINORMASK) { - drbd_msg_put_info("requested minor out of range"); - retcode = ERR_INVALID_REQUEST; - goto out; - } - if (adm_ctx.volume > DRBD_VOLUME_MAX) { - drbd_msg_put_info("requested volume id out of range"); - retcode = ERR_INVALID_REQUEST; - goto out; - } +static unsigned short * +tl_add_str(unsigned short *tl, enum drbd_tags tag, const char *str) +{ + return __tl_add_blob(tl, tag, str, strlen(str)+1, 0); +} - /* drbd_adm_prepare made sure already - * that mdev->tconn and mdev->vnr match the request. */ - if (adm_ctx.mdev) { - if (info->nlhdr->nlmsg_flags & NLM_F_EXCL) - retcode = ERR_MINOR_EXISTS; - /* else: still NO_ERROR */ - goto out; +static unsigned short * +tl_add_int(unsigned short *tl, enum drbd_tags tag, const void *val) +{ + put_unaligned(tag, tl++); + switch(tag_type(tag)) { + case TT_INTEGER: + put_unaligned(sizeof(int), tl++); + put_unaligned(*(int *)val, (int *)tl); + tl = (unsigned short*)((char*)tl+sizeof(int)); + break; + case TT_INT64: + put_unaligned(sizeof(u64), tl++); + put_unaligned(*(u64 *)val, (u64 *)tl); + tl = (unsigned short*)((char*)tl+sizeof(u64)); + break; + default: + /* someone did something stupid. */ + ; } - - retcode = conn_new_minor(adm_ctx.tconn, dh->minor, adm_ctx.volume); -out: - drbd_adm_finish(info, retcode); - return 0; + return tl; } -static enum drbd_ret_code adm_delete_minor(struct drbd_conf *mdev) +void drbd_bcast_state(struct drbd_conf *mdev, union drbd_state state) { - if (mdev->state.disk == D_DISKLESS && - /* no need to be mdev->state.conn == C_STANDALONE && - * we may want to delete a minor from a live replication group. 
- */ - mdev->state.role == R_SECONDARY) { - _drbd_request_state(mdev, NS(conn, C_WF_REPORT_PARAMS), - CS_VERBOSE + CS_WAIT_COMPLETE); - idr_remove(&mdev->tconn->volumes, mdev->vnr); - idr_remove(&minors, mdev_to_minor(mdev)); - del_gendisk(mdev->vdisk); - synchronize_rcu(); - kref_put(&mdev->kref, &drbd_minor_destroy); - return NO_ERROR; - } else - return ERR_MINOR_CONFIGURED; + char buffer[sizeof(struct cn_msg)+ + sizeof(struct drbd_nl_cfg_reply)+ + sizeof(struct get_state_tag_len_struct)+ + sizeof(short int)]; + struct cn_msg *cn_reply = (struct cn_msg *) buffer; + struct drbd_nl_cfg_reply *reply = + (struct drbd_nl_cfg_reply *)cn_reply->data; + unsigned short *tl = reply->tag_list; + + /* dev_warn(DEV, "drbd_bcast_state() got called\n"); */ + + tl = get_state_to_tags(mdev, (struct get_state *)&state, tl); + + put_unaligned(TT_END, tl++); /* Close the tag list */ + + cn_reply->id.idx = CN_IDX_DRBD; + cn_reply->id.val = CN_VAL_DRBD; + + cn_reply->seq = atomic_add_return(1, &drbd_nl_seq); + cn_reply->ack = 0; /* not used here. */ + cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + + (int)((char *)tl - (char *)reply->tag_list); + cn_reply->flags = 0; + + reply->packet_type = P_get_state; + reply->minor = mdev_to_minor(mdev); + reply->ret_code = NO_ERROR; + + cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO); } -int drbd_adm_delete_minor(struct sk_buff *skb, struct genl_info *info) +void drbd_bcast_ev_helper(struct drbd_conf *mdev, char *helper_name) { - enum drbd_ret_code retcode; + char buffer[sizeof(struct cn_msg)+ + sizeof(struct drbd_nl_cfg_reply)+ + sizeof(struct call_helper_tag_len_struct)+ + sizeof(short int)]; + struct cn_msg *cn_reply = (struct cn_msg *) buffer; + struct drbd_nl_cfg_reply *reply = + (struct drbd_nl_cfg_reply *)cn_reply->data; + unsigned short *tl = reply->tag_list; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); - if (!adm_ctx.reply_skb) - return retcode; - if (retcode != NO_ERROR) - goto out; + /* dev_warn(DEV, "drbd_bcast_state() got called\n"); */ - retcode = adm_delete_minor(adm_ctx.mdev); -out: - drbd_adm_finish(info, retcode); - return 0; + tl = tl_add_str(tl, T_helper, helper_name); + put_unaligned(TT_END, tl++); /* Close the tag list */ + + cn_reply->id.idx = CN_IDX_DRBD; + cn_reply->id.val = CN_VAL_DRBD; + + cn_reply->seq = atomic_add_return(1, &drbd_nl_seq); + cn_reply->ack = 0; /* not used here. */ + cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + + (int)((char *)tl - (char *)reply->tag_list); + cn_reply->flags = 0; + + reply->packet_type = P_call_helper; + reply->minor = mdev_to_minor(mdev); + reply->ret_code = NO_ERROR; + + cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO); } -int drbd_adm_down(struct sk_buff *skb, struct genl_info *info) +void drbd_bcast_ee(struct drbd_conf *mdev, + const char *reason, const int dgs, + const char* seen_hash, const char* calc_hash, + const struct drbd_epoch_entry* e) { - int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */ - struct drbd_conf *mdev; - unsigned i; + struct cn_msg *cn_reply; + struct drbd_nl_cfg_reply *reply; + unsigned short *tl; + struct page *page; + unsigned len; - retcode = drbd_adm_prepare(skb, info, 0); - if (!adm_ctx.reply_skb) - return retcode; - if (retcode != NO_ERROR) - goto out; + if (!e) + return; + if (!reason || !reason[0]) + return; - if (!adm_ctx.tconn) { - retcode = ERR_RES_NOT_KNOWN; - goto out; + /* apparently we have to memcpy twice, first to prepare the data for the + * struct cn_msg, then within cn_netlink_send from the cn_msg to the + * netlink skb. 
*/ + /* receiver thread context, which is not in the writeout path (of this node), + * but may be in the writeout path of the _other_ node. + * GFP_NOIO to avoid potential "distributed deadlock". */ + cn_reply = kzalloc( + sizeof(struct cn_msg)+ + sizeof(struct drbd_nl_cfg_reply)+ + sizeof(struct dump_ee_tag_len_struct)+ + sizeof(short int), + GFP_NOIO); + + if (!cn_reply) { + dev_err(DEV, "could not kmalloc buffer for drbd_bcast_ee, sector %llu, size %u\n", + (unsigned long long)e->sector, e->size); + return; } - /* demote */ - idr_for_each_entry(&adm_ctx.tconn->volumes, mdev, i) { - retcode = drbd_set_role(mdev, R_SECONDARY, 0); - if (retcode < SS_SUCCESS) { - drbd_msg_put_info("failed to demote"); - goto out; - } + reply = (struct drbd_nl_cfg_reply*)cn_reply->data; + tl = reply->tag_list; + + tl = tl_add_str(tl, T_dump_ee_reason, reason); + tl = tl_add_blob(tl, T_seen_digest, seen_hash, dgs); + tl = tl_add_blob(tl, T_calc_digest, calc_hash, dgs); + tl = tl_add_int(tl, T_ee_sector, &e->sector); + tl = tl_add_int(tl, T_ee_block_id, &e->block_id); + + /* dump the first 32k */ + len = min_t(unsigned, e->size, 32 << 10); + put_unaligned(T_ee_data, tl++); + put_unaligned(len, tl++); + + page = e->pages; + page_chain_for_each(page) { + void *d = kmap_atomic(page); + unsigned l = min_t(unsigned, len, PAGE_SIZE); + memcpy(tl, d, l); + kunmap_atomic(d); + tl = (unsigned short*)((char*)tl + l); + len -= l; + if (len == 0) + break; } + put_unaligned(TT_END, tl++); /* Close the tag list */ - retcode = conn_try_disconnect(adm_ctx.tconn, 0); - if (retcode < SS_SUCCESS) { - drbd_msg_put_info("failed to disconnect"); - goto out; - } + cn_reply->id.idx = CN_IDX_DRBD; + cn_reply->id.val = CN_VAL_DRBD; - /* detach */ - idr_for_each_entry(&adm_ctx.tconn->volumes, mdev, i) { - retcode = adm_detach(mdev, 0); - if (retcode < SS_SUCCESS || retcode > NO_ERROR) { - drbd_msg_put_info("failed to detach"); - goto out; - } - } + cn_reply->seq = atomic_add_return(1,&drbd_nl_seq); + cn_reply->ack = 0; // not used here. + cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + + (int)((char*)tl - (char*)reply->tag_list); + cn_reply->flags = 0; - /* If we reach this, all volumes (of this tconn) are Secondary, - * Disconnected, Diskless, aka Unconfigured. Make sure all threads have - * actually stopped, state handling only does drbd_thread_stop_nowait(). */ - drbd_thread_stop(&adm_ctx.tconn->worker); + reply->packet_type = P_dump_ee; + reply->minor = mdev_to_minor(mdev); + reply->ret_code = NO_ERROR; - /* Now, nothing can fail anymore */ + cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO); + kfree(cn_reply); +} - /* delete volumes */ - idr_for_each_entry(&adm_ctx.tconn->volumes, mdev, i) { - retcode = adm_delete_minor(mdev); - if (retcode != NO_ERROR) { - /* "can not happen" */ - drbd_msg_put_info("failed to delete volume"); - goto out; - } - } +void drbd_bcast_sync_progress(struct drbd_conf *mdev) +{ + char buffer[sizeof(struct cn_msg)+ + sizeof(struct drbd_nl_cfg_reply)+ + sizeof(struct sync_progress_tag_len_struct)+ + sizeof(short int)]; + struct cn_msg *cn_reply = (struct cn_msg *) buffer; + struct drbd_nl_cfg_reply *reply = + (struct drbd_nl_cfg_reply *)cn_reply->data; + unsigned short *tl = reply->tag_list; + unsigned long rs_left; + unsigned int res; - /* delete connection */ - if (conn_lowest_minor(adm_ctx.tconn) < 0) { - list_del_rcu(&adm_ctx.tconn->all_tconn); - synchronize_rcu(); - kref_put(&adm_ctx.tconn->kref, &conn_destroy); + /* no local ref, no bitmap, no syncer progress, no broadcast. 
*/ + if (!get_ldev(mdev)) + return; + drbd_get_syncer_progress(mdev, &rs_left, &res); + put_ldev(mdev); - retcode = NO_ERROR; - } else { - /* "can not happen" */ - retcode = ERR_RES_IN_USE; - drbd_msg_put_info("failed to delete connection"); - } - goto out; -out: - drbd_adm_finish(info, retcode); - return 0; + tl = tl_add_int(tl, T_sync_progress, &res); + put_unaligned(TT_END, tl++); /* Close the tag list */ + + cn_reply->id.idx = CN_IDX_DRBD; + cn_reply->id.val = CN_VAL_DRBD; + + cn_reply->seq = atomic_add_return(1, &drbd_nl_seq); + cn_reply->ack = 0; /* not used here. */ + cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + + (int)((char *)tl - (char *)reply->tag_list); + cn_reply->flags = 0; + + reply->packet_type = P_sync_progress; + reply->minor = mdev_to_minor(mdev); + reply->ret_code = NO_ERROR; + + cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO); } -int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info) +int __init drbd_nl_init(void) { - enum drbd_ret_code retcode; - - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE); - if (!adm_ctx.reply_skb) - return retcode; - if (retcode != NO_ERROR) - goto out; + static struct cb_id cn_id_drbd; + int err, try=10; - if (conn_lowest_minor(adm_ctx.tconn) < 0) { - list_del_rcu(&adm_ctx.tconn->all_tconn); - synchronize_rcu(); - kref_put(&adm_ctx.tconn->kref, &conn_destroy); + cn_id_drbd.val = CN_VAL_DRBD; + do { + cn_id_drbd.idx = cn_idx; + err = cn_add_callback(&cn_id_drbd, "cn_drbd", &drbd_connector_callback); + if (!err) + break; + cn_idx = (cn_idx + CN_IDX_STEP); + } while (try--); - retcode = NO_ERROR; - } else { - retcode = ERR_RES_IN_USE; + if (err) { + printk(KERN_ERR "drbd: cn_drbd failed to register\n"); + return err; } - if (retcode == NO_ERROR) - drbd_thread_stop(&adm_ctx.tconn->worker); -out: - drbd_adm_finish(info, retcode); return 0; } -void drbd_bcast_event(struct drbd_conf *mdev, const struct sib_info *sib) +void drbd_nl_cleanup(void) { - static atomic_t drbd_genl_seq = ATOMIC_INIT(2); /* two. */ - struct sk_buff *msg; - struct drbd_genlmsghdr *d_out; - unsigned seq; - int err = -ENOMEM; - - if (sib->sib_reason == SIB_SYNC_PROGRESS) { - if (time_after(jiffies, mdev->rs_last_bcast + HZ)) - mdev->rs_last_bcast = jiffies; - else - return; - } + static struct cb_id cn_id_drbd; - seq = atomic_inc_return(&drbd_genl_seq); - msg = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO); - if (!msg) - goto failed; + cn_id_drbd.idx = cn_idx; + cn_id_drbd.val = CN_VAL_DRBD; - err = -EMSGSIZE; - d_out = genlmsg_put(msg, 0, seq, &drbd_genl_family, 0, DRBD_EVENT); - if (!d_out) /* cannot happen, but anyways. */ - goto nla_put_failure; - d_out->minor = mdev_to_minor(mdev); - d_out->ret_code = NO_ERROR; + cn_del_callback(&cn_id_drbd); +} - if (nla_put_status_info(msg, mdev, sib)) - goto nla_put_failure; - genlmsg_end(msg, d_out); - err = drbd_genl_multicast_events(msg, 0); - /* msg has been consumed or freed in netlink_broadcast() */ - if (err && err != -ESRCH) - goto failed; +void drbd_nl_send_reply(struct cn_msg *req, int ret_code) +{ + char buffer[sizeof(struct cn_msg)+sizeof(struct drbd_nl_cfg_reply)]; + struct cn_msg *cn_reply = (struct cn_msg *) buffer; + struct drbd_nl_cfg_reply *reply = + (struct drbd_nl_cfg_reply *)cn_reply->data; + int rr; - return; + memset(buffer, 0, sizeof(buffer)); + cn_reply->id = req->id; -nla_put_failure: - nlmsg_free(msg); -failed: - dev_err(DEV, "Error %d while broadcasting event. 
" - "Event seq:%u sib_reason:%u\n", - err, seq, sib->sib_reason); + cn_reply->seq = req->seq; + cn_reply->ack = req->ack + 1; + cn_reply->len = sizeof(struct drbd_nl_cfg_reply); + cn_reply->flags = 0; + + reply->packet_type = P_return_code_only; + reply->minor = ((struct drbd_nl_cfg_req *)req->data)->drbd_minor; + reply->ret_code = ret_code; + + rr = cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO); + if (rr && rr != -ESRCH) + printk(KERN_INFO "drbd: cn_netlink_send()=%d\n", rr); } + diff --git a/trunk/drivers/block/drbd/drbd_nla.c b/trunk/drivers/block/drbd/drbd_nla.c deleted file mode 100644 index fa672b6df8d6..000000000000 --- a/trunk/drivers/block/drbd/drbd_nla.c +++ /dev/null @@ -1,55 +0,0 @@ -#include "drbd_wrappers.h" -#include -#include -#include -#include "drbd_nla.h" - -static int drbd_nla_check_mandatory(int maxtype, struct nlattr *nla) -{ - struct nlattr *head = nla_data(nla); - int len = nla_len(nla); - int rem; - - /* - * validate_nla (called from nla_parse_nested) ignores attributes - * beyond maxtype, and does not understand the DRBD_GENLA_F_MANDATORY flag. - * In order to have it validate attributes with the DRBD_GENLA_F_MANDATORY - * flag set also, check and remove that flag before calling - * nla_parse_nested. - */ - - nla_for_each_attr(nla, head, len, rem) { - if (nla->nla_type & DRBD_GENLA_F_MANDATORY) { - nla->nla_type &= ~DRBD_GENLA_F_MANDATORY; - if (nla_type(nla) > maxtype) - return -EOPNOTSUPP; - } - } - return 0; -} - -int drbd_nla_parse_nested(struct nlattr *tb[], int maxtype, struct nlattr *nla, - const struct nla_policy *policy) -{ - int err; - - err = drbd_nla_check_mandatory(maxtype, nla); - if (!err) - err = nla_parse_nested(tb, maxtype, nla, policy); - - return err; -} - -struct nlattr *drbd_nla_find_nested(int maxtype, struct nlattr *nla, int attrtype) -{ - int err; - /* - * If any nested attribute has the DRBD_GENLA_F_MANDATORY flag set and - * we don't know about that attribute, reject all the nested - * attributes. - */ - err = drbd_nla_check_mandatory(maxtype, nla); - if (err) - return ERR_PTR(err); - return nla_find_nested(nla, attrtype); -} diff --git a/trunk/drivers/block/drbd/drbd_nla.h b/trunk/drivers/block/drbd/drbd_nla.h deleted file mode 100644 index 679c2d5b4535..000000000000 --- a/trunk/drivers/block/drbd/drbd_nla.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __DRBD_NLA_H -#define __DRBD_NLA_H - -extern int drbd_nla_parse_nested(struct nlattr *tb[], int maxtype, struct nlattr *nla, - const struct nla_policy *policy); -extern struct nlattr *drbd_nla_find_nested(int maxtype, struct nlattr *nla, int attrtype); - -#endif /* __DRBD_NLA_H */ diff --git a/trunk/drivers/block/drbd/drbd_proc.c b/trunk/drivers/block/drbd/drbd_proc.c index 56672a61eb94..5496104f90b9 100644 --- a/trunk/drivers/block/drbd/drbd_proc.c +++ b/trunk/drivers/block/drbd/drbd_proc.c @@ -167,24 +167,18 @@ static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq) * we convert to sectors in the display below. */ unsigned long bm_bits = drbd_bm_bits(mdev); unsigned long bit_pos; - unsigned long long stop_sector = 0; if (mdev->state.conn == C_VERIFY_S || - mdev->state.conn == C_VERIFY_T) { + mdev->state.conn == C_VERIFY_T) bit_pos = bm_bits - mdev->ov_left; - if (verify_can_do_stop_sector(mdev)) - stop_sector = mdev->ov_stop_sector; - } else + else bit_pos = mdev->bm_resync_fo; /* Total sectors may be slightly off for oddly * sized devices. So what. 
*/ seq_printf(seq, - "\t%3d%% sector pos: %llu/%llu", + "\t%3d%% sector pos: %llu/%llu\n", (int)(bit_pos / (bm_bits/100+1)), (unsigned long long)bit_pos * BM_SECT_PER_BIT, (unsigned long long)bm_bits * BM_SECT_PER_BIT); - if (stop_sector != 0 && stop_sector != ULLONG_MAX) - seq_printf(seq, " stop sector: %llu", stop_sector); - seq_printf(seq, "\n"); } } @@ -200,11 +194,9 @@ static void resync_dump_detail(struct seq_file *seq, struct lc_element *e) static int drbd_seq_show(struct seq_file *seq, void *v) { - int i, prev_i = -1; + int i, hole = 0; const char *sn; struct drbd_conf *mdev; - struct net_conf *nc; - char wp; static char write_ordering_chars[] = { [WO_none] = 'n', @@ -235,11 +227,16 @@ static int drbd_seq_show(struct seq_file *seq, void *v) oos .. known out-of-sync kB */ - rcu_read_lock(); - idr_for_each_entry(&minors, mdev, i) { - if (prev_i != i - 1) + for (i = 0; i < minor_count; i++) { + mdev = minor_to_mdev(i); + if (!mdev) { + hole = 1; + continue; + } + if (hole) { + hole = 0; seq_printf(seq, "\n"); - prev_i = i; + } sn = drbd_conn_str(mdev->state.conn); @@ -251,8 +248,6 @@ static int drbd_seq_show(struct seq_file *seq, void *v) /* reset mdev->congestion_reason */ bdi_rw_congested(&mdev->rq_queue->backing_dev_info); - nc = rcu_dereference(mdev->tconn->net_conf); - wp = nc ? nc->wire_protocol - DRBD_PROT_A + 'A' : ' '; seq_printf(seq, "%2d: cs:%s ro:%s/%s ds:%s/%s %c %c%c%c%c%c%c\n" " ns:%u nr:%u dw:%u dr:%u al:%u bm:%u " @@ -262,8 +257,9 @@ static int drbd_seq_show(struct seq_file *seq, void *v) drbd_role_str(mdev->state.peer), drbd_disk_str(mdev->state.disk), drbd_disk_str(mdev->state.pdsk), - wp, - drbd_suspended(mdev) ? 's' : 'r', + (mdev->net_conf == NULL ? ' ' : + (mdev->net_conf->wire_protocol - DRBD_PROT_A+'A')), + is_susp(mdev->state) ? 's' : 'r', mdev->state.aftr_isp ? 'a' : '-', mdev->state.peer_isp ? 'p' : '-', mdev->state.user_isp ? 
'u' : '-', @@ -280,8 +276,8 @@ static int drbd_seq_show(struct seq_file *seq, void *v) atomic_read(&mdev->rs_pending_cnt), atomic_read(&mdev->unacked_cnt), atomic_read(&mdev->ap_bio_cnt), - mdev->tconn->epochs, - write_ordering_chars[mdev->tconn->write_ordering] + mdev->epochs, + write_ordering_chars[mdev->write_ordering] ); seq_printf(seq, " oos:%llu\n", Bit2KB((unsigned long long) @@ -306,7 +302,6 @@ static int drbd_seq_show(struct seq_file *seq, void *v) } } } - rcu_read_unlock(); return 0; } diff --git a/trunk/drivers/block/drbd/drbd_receiver.c b/trunk/drivers/block/drbd/drbd_receiver.c index a9eccfc6079b..c74ca2df7431 100644 --- a/trunk/drivers/block/drbd/drbd_receiver.c +++ b/trunk/drivers/block/drbd/drbd_receiver.c @@ -48,25 +48,17 @@ #include "drbd_vli.h" -struct packet_info { - enum drbd_packet cmd; - unsigned int size; - unsigned int vnr; - void *data; -}; - enum finish_epoch { FE_STILL_LIVE, FE_DESTROYED, FE_RECYCLED, }; -static int drbd_do_features(struct drbd_tconn *tconn); -static int drbd_do_auth(struct drbd_tconn *tconn); -static int drbd_disconnected(struct drbd_conf *mdev); +static int drbd_do_handshake(struct drbd_conf *mdev); +static int drbd_do_auth(struct drbd_conf *mdev); -static enum finish_epoch drbd_may_finish_epoch(struct drbd_tconn *, struct drbd_epoch *, enum epoch_event); -static int e_end_block(struct drbd_work *, int); +static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *, struct drbd_epoch *, enum epoch_event); +static int e_end_block(struct drbd_conf *, struct drbd_work *, int); #define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN) @@ -150,12 +142,11 @@ static void page_chain_add(struct page **head, *head = chain_first; } -static struct page *__drbd_alloc_pages(struct drbd_conf *mdev, - unsigned int number) +static struct page *drbd_pp_first_pages_or_try_alloc(struct drbd_conf *mdev, int number) { struct page *page = NULL; struct page *tmp = NULL; - unsigned int i = 0; + int i = 0; /* Yes, testing drbd_pp_vacant outside the lock is racy. * So what. It saves a spin_lock. */ @@ -184,7 +175,7 @@ static struct page *__drbd_alloc_pages(struct drbd_conf *mdev, return page; /* Not enough pages immediately available this time. - * No need to jump around here, drbd_alloc_pages will retry this + * No need to jump around here, drbd_pp_alloc will retry this * function "soon". */ if (page) { tmp = page_chain_tail(page, NULL); @@ -196,10 +187,9 @@ static struct page *__drbd_alloc_pages(struct drbd_conf *mdev, return NULL; } -static void reclaim_finished_net_peer_reqs(struct drbd_conf *mdev, - struct list_head *to_be_freed) +static void reclaim_net_ee(struct drbd_conf *mdev, struct list_head *to_be_freed) { - struct drbd_peer_request *peer_req; + struct drbd_epoch_entry *e; struct list_head *le, *tle; /* The EEs are always appended to the end of the list. Since @@ -208,8 +198,8 @@ static void reclaim_finished_net_peer_reqs(struct drbd_conf *mdev, stop to examine the list... 
*/ list_for_each_safe(le, tle, &mdev->net_ee) { - peer_req = list_entry(le, struct drbd_peer_request, w.list); - if (drbd_peer_req_has_active_page(peer_req)) + e = list_entry(le, struct drbd_epoch_entry, w.list); + if (drbd_ee_has_active_page(e)) break; list_move(le, to_be_freed); } @@ -218,18 +208,18 @@ static void reclaim_finished_net_peer_reqs(struct drbd_conf *mdev, static void drbd_kick_lo_and_reclaim_net(struct drbd_conf *mdev) { LIST_HEAD(reclaimed); - struct drbd_peer_request *peer_req, *t; + struct drbd_epoch_entry *e, *t; - spin_lock_irq(&mdev->tconn->req_lock); - reclaim_finished_net_peer_reqs(mdev, &reclaimed); - spin_unlock_irq(&mdev->tconn->req_lock); + spin_lock_irq(&mdev->req_lock); + reclaim_net_ee(mdev, &reclaimed); + spin_unlock_irq(&mdev->req_lock); - list_for_each_entry_safe(peer_req, t, &reclaimed, w.list) - drbd_free_net_peer_req(mdev, peer_req); + list_for_each_entry_safe(e, t, &reclaimed, w.list) + drbd_free_net_ee(mdev, e); } /** - * drbd_alloc_pages() - Returns @number pages, retries forever (or until signalled) + * drbd_pp_alloc() - Returns @number pages, retries forever (or until signalled) * @mdev: DRBD device. * @number: number of pages requested * @retry: whether to retry, if not enough pages are available right now @@ -240,31 +230,23 @@ static void drbd_kick_lo_and_reclaim_net(struct drbd_conf *mdev) * * Returns a page chain linked via page->private. */ -struct page *drbd_alloc_pages(struct drbd_conf *mdev, unsigned int number, - bool retry) +static struct page *drbd_pp_alloc(struct drbd_conf *mdev, unsigned number, bool retry) { struct page *page = NULL; - struct net_conf *nc; DEFINE_WAIT(wait); - int mxb; /* Yes, we may run up to @number over max_buffers. If we * follow it strictly, the admin will get it wrong anyways. */ - rcu_read_lock(); - nc = rcu_dereference(mdev->tconn->net_conf); - mxb = nc ? nc->max_buffers : 1000000; - rcu_read_unlock(); - - if (atomic_read(&mdev->pp_in_use) < mxb) - page = __drbd_alloc_pages(mdev, number); + if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) + page = drbd_pp_first_pages_or_try_alloc(mdev, number); while (page == NULL) { prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE); drbd_kick_lo_and_reclaim_net(mdev); - if (atomic_read(&mdev->pp_in_use) < mxb) { - page = __drbd_alloc_pages(mdev, number); + if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) { + page = drbd_pp_first_pages_or_try_alloc(mdev, number); if (page) break; } @@ -273,7 +255,7 @@ struct page *drbd_alloc_pages(struct drbd_conf *mdev, unsigned int number, break; if (signal_pending(current)) { - dev_warn(DEV, "drbd_alloc_pages interrupted!\n"); + dev_warn(DEV, "drbd_pp_alloc interrupted!\n"); break; } @@ -286,11 +268,11 @@ struct page *drbd_alloc_pages(struct drbd_conf *mdev, unsigned int number, return page; } -/* Must not be used from irq, as that may deadlock: see drbd_alloc_pages. - * Is also used from inside an other spin_lock_irq(&mdev->tconn->req_lock); +/* Must not be used from irq, as that may deadlock: see drbd_pp_alloc. + * Is also used from inside an other spin_lock_irq(&mdev->req_lock); * Either links the page chain back to the global pool, * or returns all pages to the system. */ -static void drbd_free_pages(struct drbd_conf *mdev, struct page *page, int is_net) +static void drbd_pp_free(struct drbd_conf *mdev, struct page *page, int is_net) { atomic_t *a = is_net ? 
&mdev->pp_in_use_by_net : &mdev->pp_in_use; int i; @@ -298,7 +280,7 @@ static void drbd_free_pages(struct drbd_conf *mdev, struct page *page, int is_ne if (page == NULL) return; - if (drbd_pp_vacant > (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count) + if (drbd_pp_vacant > (DRBD_MAX_BIO_SIZE/PAGE_SIZE)*minor_count) i = page_chain_free(page); else { struct page *tmp; @@ -320,130 +302,127 @@ You need to hold the req_lock: _drbd_wait_ee_list_empty() You must not have the req_lock: - drbd_free_peer_req() - drbd_alloc_peer_req() - drbd_free_peer_reqs() + drbd_free_ee() + drbd_alloc_ee() + drbd_init_ee() + drbd_release_ee() drbd_ee_fix_bhs() - drbd_finish_peer_reqs() + drbd_process_done_ee() drbd_clear_done_ee() drbd_wait_ee_list_empty() */ -struct drbd_peer_request * -drbd_alloc_peer_req(struct drbd_conf *mdev, u64 id, sector_t sector, - unsigned int data_size, gfp_t gfp_mask) __must_hold(local) +struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev, + u64 id, + sector_t sector, + unsigned int data_size, + gfp_t gfp_mask) __must_hold(local) { - struct drbd_peer_request *peer_req; + struct drbd_epoch_entry *e; struct page *page = NULL; unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT; if (drbd_insert_fault(mdev, DRBD_FAULT_AL_EE)) return NULL; - peer_req = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM); - if (!peer_req) { + e = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM); + if (!e) { if (!(gfp_mask & __GFP_NOWARN)) - dev_err(DEV, "%s: allocation failed\n", __func__); + dev_err(DEV, "alloc_ee: Allocation of an EE failed\n"); return NULL; } if (data_size) { - page = drbd_alloc_pages(mdev, nr_pages, (gfp_mask & __GFP_WAIT)); + page = drbd_pp_alloc(mdev, nr_pages, (gfp_mask & __GFP_WAIT)); if (!page) goto fail; } - drbd_clear_interval(&peer_req->i); - peer_req->i.size = data_size; - peer_req->i.sector = sector; - peer_req->i.local = false; - peer_req->i.waiting = false; - - peer_req->epoch = NULL; - peer_req->w.mdev = mdev; - peer_req->pages = page; - atomic_set(&peer_req->pending_bios, 0); - peer_req->flags = 0; - /* - * The block_id is opaque to the receiver. It is not endianness - * converted, and sent back to the sender unchanged. 
- */ - peer_req->block_id = id; + INIT_HLIST_NODE(&e->collision); + e->epoch = NULL; + e->mdev = mdev; + e->pages = page; + atomic_set(&e->pending_bios, 0); + e->size = data_size; + e->flags = 0; + e->sector = sector; + e->block_id = id; - return peer_req; + return e; fail: - mempool_free(peer_req, drbd_ee_mempool); + mempool_free(e, drbd_ee_mempool); return NULL; } -void __drbd_free_peer_req(struct drbd_conf *mdev, struct drbd_peer_request *peer_req, - int is_net) +void drbd_free_some_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, int is_net) { - if (peer_req->flags & EE_HAS_DIGEST) - kfree(peer_req->digest); - drbd_free_pages(mdev, peer_req->pages, is_net); - D_ASSERT(atomic_read(&peer_req->pending_bios) == 0); - D_ASSERT(drbd_interval_empty(&peer_req->i)); - mempool_free(peer_req, drbd_ee_mempool); + if (e->flags & EE_HAS_DIGEST) + kfree(e->digest); + drbd_pp_free(mdev, e->pages, is_net); + D_ASSERT(atomic_read(&e->pending_bios) == 0); + D_ASSERT(hlist_unhashed(&e->collision)); + mempool_free(e, drbd_ee_mempool); } -int drbd_free_peer_reqs(struct drbd_conf *mdev, struct list_head *list) +int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list) { LIST_HEAD(work_list); - struct drbd_peer_request *peer_req, *t; + struct drbd_epoch_entry *e, *t; int count = 0; int is_net = list == &mdev->net_ee; - spin_lock_irq(&mdev->tconn->req_lock); + spin_lock_irq(&mdev->req_lock); list_splice_init(list, &work_list); - spin_unlock_irq(&mdev->tconn->req_lock); + spin_unlock_irq(&mdev->req_lock); - list_for_each_entry_safe(peer_req, t, &work_list, w.list) { - __drbd_free_peer_req(mdev, peer_req, is_net); + list_for_each_entry_safe(e, t, &work_list, w.list) { + drbd_free_some_ee(mdev, e, is_net); count++; } return count; } + /* - * See also comments in _req_mod(,BARRIER_ACKED) and receive_Barrier. + * This function is called from _asender only_ + * but see also comments in _req_mod(,barrier_acked) + * and receive_Barrier. + * + * Move entries from net_ee to done_ee, if ready. + * Grab done_ee, call all callbacks, free the entries. + * The callbacks typically send out ACKs. */ -static int drbd_finish_peer_reqs(struct drbd_conf *mdev) +static int drbd_process_done_ee(struct drbd_conf *mdev) { LIST_HEAD(work_list); LIST_HEAD(reclaimed); - struct drbd_peer_request *peer_req, *t; - int err = 0; + struct drbd_epoch_entry *e, *t; + int ok = (mdev->state.conn >= C_WF_REPORT_PARAMS); - spin_lock_irq(&mdev->tconn->req_lock); - reclaim_finished_net_peer_reqs(mdev, &reclaimed); + spin_lock_irq(&mdev->req_lock); + reclaim_net_ee(mdev, &reclaimed); list_splice_init(&mdev->done_ee, &work_list); - spin_unlock_irq(&mdev->tconn->req_lock); + spin_unlock_irq(&mdev->req_lock); - list_for_each_entry_safe(peer_req, t, &reclaimed, w.list) - drbd_free_net_peer_req(mdev, peer_req); + list_for_each_entry_safe(e, t, &reclaimed, w.list) + drbd_free_net_ee(mdev, e); /* possible callbacks here: - * e_end_block, and e_end_resync_block, e_send_superseded. + * e_end_block, and e_end_resync_block, e_send_discard_ack. * all ignore the last argument. 
*/ - list_for_each_entry_safe(peer_req, t, &work_list, w.list) { - int err2; - + list_for_each_entry_safe(e, t, &work_list, w.list) { /* list_del not necessary, next/prev members not touched */ - err2 = peer_req->w.cb(&peer_req->w, !!err); - if (!err) - err = err2; - drbd_free_peer_req(mdev, peer_req); + ok = e->w.cb(mdev, &e->w, !ok) && ok; + drbd_free_ee(mdev, e); } wake_up(&mdev->ee_wait); - return err; + return ok; } -static void _drbd_wait_ee_list_empty(struct drbd_conf *mdev, - struct list_head *head) +void _drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head) { DEFINE_WAIT(wait); @@ -451,22 +430,55 @@ static void _drbd_wait_ee_list_empty(struct drbd_conf *mdev, * and calling prepare_to_wait in the fast path */ while (!list_empty(head)) { prepare_to_wait(&mdev->ee_wait, &wait, TASK_UNINTERRUPTIBLE); - spin_unlock_irq(&mdev->tconn->req_lock); + spin_unlock_irq(&mdev->req_lock); io_schedule(); finish_wait(&mdev->ee_wait, &wait); - spin_lock_irq(&mdev->tconn->req_lock); + spin_lock_irq(&mdev->req_lock); } } -static void drbd_wait_ee_list_empty(struct drbd_conf *mdev, - struct list_head *head) +void drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head) { - spin_lock_irq(&mdev->tconn->req_lock); + spin_lock_irq(&mdev->req_lock); _drbd_wait_ee_list_empty(mdev, head); - spin_unlock_irq(&mdev->tconn->req_lock); + spin_unlock_irq(&mdev->req_lock); } -static int drbd_recv_short(struct socket *sock, void *buf, size_t size, int flags) +/* see also kernel_accept; which is only present since 2.6.18. + * also we want to log which part of it failed, exactly */ +static int drbd_accept(struct drbd_conf *mdev, const char **what, + struct socket *sock, struct socket **newsock) +{ + struct sock *sk = sock->sk; + int err = 0; + + *what = "listen"; + err = sock->ops->listen(sock, 5); + if (err < 0) + goto out; + + *what = "sock_create_lite"; + err = sock_create_lite(sk->sk_family, sk->sk_type, sk->sk_protocol, + newsock); + if (err < 0) + goto out; + + *what = "accept"; + err = sock->ops->accept(sock, *newsock, 0); + if (err < 0) { + sock_release(*newsock); + *newsock = NULL; + goto out; + } + (*newsock)->ops = sock->ops; + __module_get((*newsock)->ops->owner); + +out: + return err; +} + +static int drbd_recv_short(struct drbd_conf *mdev, struct socket *sock, + void *buf, size_t size, int flags) { mm_segment_t oldfs; struct kvec iov = { @@ -488,62 +500,59 @@ static int drbd_recv_short(struct socket *sock, void *buf, size_t size, int flag return rv; } -static int drbd_recv(struct drbd_tconn *tconn, void *buf, size_t size) +static int drbd_recv(struct drbd_conf *mdev, void *buf, size_t size) { + mm_segment_t oldfs; + struct kvec iov = { + .iov_base = buf, + .iov_len = size, + }; + struct msghdr msg = { + .msg_iovlen = 1, + .msg_iov = (struct iovec *)&iov, + .msg_flags = MSG_WAITALL | MSG_NOSIGNAL + }; int rv; - rv = drbd_recv_short(tconn->data.socket, buf, size, 0); + oldfs = get_fs(); + set_fs(KERNEL_DS); - if (rv < 0) { - if (rv == -ECONNRESET) - conn_info(tconn, "sock was reset by peer\n"); - else if (rv != -ERESTARTSYS) - conn_err(tconn, "sock_recvmsg returned %d\n", rv); - } else if (rv == 0) { - if (test_bit(DISCONNECT_SENT, &tconn->flags)) { - long t; - rcu_read_lock(); - t = rcu_dereference(tconn->net_conf)->ping_timeo * HZ/10; - rcu_read_unlock(); + for (;;) { + rv = sock_recvmsg(mdev->data.socket, &msg, size, msg.msg_flags); + if (rv == size) + break; - t = wait_event_timeout(tconn->ping_wait, tconn->cstate < C_WF_REPORT_PARAMS, t); + /* Note: + * ECONNRESET 
other side closed the connection + * ERESTARTSYS (on sock) we got a signal + */ - if (t) - goto out; + if (rv < 0) { + if (rv == -ECONNRESET) + dev_info(DEV, "sock was reset by peer\n"); + else if (rv != -ERESTARTSYS) + dev_err(DEV, "sock_recvmsg returned %d\n", rv); + break; + } else if (rv == 0) { + dev_info(DEV, "sock was shut down by peer\n"); + break; + } else { + /* signal came in, or peer/link went down, + * after we read a partial message + */ + /* D_ASSERT(signal_pending(current)); */ + break; } - conn_info(tconn, "sock was shut down by peer\n"); - } + }; + + set_fs(oldfs); if (rv != size) - conn_request_state(tconn, NS(conn, C_BROKEN_PIPE), CS_HARD); + drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE)); -out: return rv; } -static int drbd_recv_all(struct drbd_tconn *tconn, void *buf, size_t size) -{ - int err; - - err = drbd_recv(tconn, buf, size); - if (err != size) { - if (err >= 0) - err = -EIO; - } else - err = 0; - return err; -} - -static int drbd_recv_all_warn(struct drbd_tconn *tconn, void *buf, size_t size) -{ - int err; - - err = drbd_recv_all(tconn, buf, size); - if (err && !signal_pending(current)) - conn_warn(tconn, "short read (expected size %d)\n", (int)size); - return err; -} - /* quoting tcp(7): * On individual connections, the socket buffer size must be set prior to the * listen(2) or connect(2) calls in order to have it take effect. @@ -563,50 +572,29 @@ static void drbd_setbufsize(struct socket *sock, unsigned int snd, } } -static struct socket *drbd_try_connect(struct drbd_tconn *tconn) +static struct socket *drbd_try_connect(struct drbd_conf *mdev) { const char *what; struct socket *sock; struct sockaddr_in6 src_in6; - struct sockaddr_in6 peer_in6; - struct net_conf *nc; - int err, peer_addr_len, my_addr_len; - int sndbuf_size, rcvbuf_size, connect_int; + int err; int disconnect_on_error = 1; - rcu_read_lock(); - nc = rcu_dereference(tconn->net_conf); - if (!nc) { - rcu_read_unlock(); + if (!get_net_conf(mdev)) return NULL; - } - sndbuf_size = nc->sndbuf_size; - rcvbuf_size = nc->rcvbuf_size; - connect_int = nc->connect_int; - rcu_read_unlock(); - - my_addr_len = min_t(int, tconn->my_addr_len, sizeof(src_in6)); - memcpy(&src_in6, &tconn->my_addr, my_addr_len); - - if (((struct sockaddr *)&tconn->my_addr)->sa_family == AF_INET6) - src_in6.sin6_port = 0; - else - ((struct sockaddr_in *)&src_in6)->sin_port = 0; /* AF_INET & AF_SCI */ - - peer_addr_len = min_t(int, tconn->peer_addr_len, sizeof(src_in6)); - memcpy(&peer_in6, &tconn->peer_addr, peer_addr_len); what = "sock_create_kern"; - err = sock_create_kern(((struct sockaddr *)&src_in6)->sa_family, - SOCK_STREAM, IPPROTO_TCP, &sock); + err = sock_create_kern(((struct sockaddr *)mdev->net_conf->my_addr)->sa_family, + SOCK_STREAM, IPPROTO_TCP, &sock); if (err < 0) { sock = NULL; goto out; } sock->sk->sk_rcvtimeo = - sock->sk->sk_sndtimeo = connect_int * HZ; - drbd_setbufsize(sock, sndbuf_size, rcvbuf_size); + sock->sk->sk_sndtimeo = mdev->net_conf->try_connect_int*HZ; + drbd_setbufsize(sock, mdev->net_conf->sndbuf_size, + mdev->net_conf->rcvbuf_size); /* explicitly bind to the configured IP as source IP * for the outgoing connections. @@ -615,8 +603,17 @@ static struct socket *drbd_try_connect(struct drbd_tconn *tconn) * Make sure to use 0 as port number, so linux selects * a free one dynamically. 
*/ + memcpy(&src_in6, mdev->net_conf->my_addr, + min_t(int, mdev->net_conf->my_addr_len, sizeof(src_in6))); + if (((struct sockaddr *)mdev->net_conf->my_addr)->sa_family == AF_INET6) + src_in6.sin6_port = 0; + else + ((struct sockaddr_in *)&src_in6)->sin_port = 0; /* AF_INET & AF_SCI */ + what = "bind before connect"; - err = sock->ops->bind(sock, (struct sockaddr *) &src_in6, my_addr_len); + err = sock->ops->bind(sock, + (struct sockaddr *) &src_in6, + mdev->net_conf->my_addr_len); if (err < 0) goto out; @@ -624,7 +621,9 @@ static struct socket *drbd_try_connect(struct drbd_tconn *tconn) * stay C_WF_CONNECTION, don't go Disconnecting! */ disconnect_on_error = 0; what = "connect"; - err = sock->ops->connect(sock, (struct sockaddr *) &peer_in6, peer_addr_len, 0); + err = sock->ops->connect(sock, + (struct sockaddr *)mdev->net_conf->peer_addr, + mdev->net_conf->peer_addr_len, 0); out: if (err < 0) { @@ -642,174 +641,91 @@ static struct socket *drbd_try_connect(struct drbd_tconn *tconn) disconnect_on_error = 0; break; default: - conn_err(tconn, "%s failed, err = %d\n", what, err); + dev_err(DEV, "%s failed, err = %d\n", what, err); } if (disconnect_on_error) - conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD); + drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); } - + put_net_conf(mdev); return sock; } -struct accept_wait_data { - struct drbd_tconn *tconn; - struct socket *s_listen; - struct completion door_bell; - void (*original_sk_state_change)(struct sock *sk); - -}; - -static void drbd_incoming_connection(struct sock *sk) +static struct socket *drbd_wait_for_connect(struct drbd_conf *mdev) { - struct accept_wait_data *ad = sk->sk_user_data; - void (*state_change)(struct sock *sk); - - state_change = ad->original_sk_state_change; - if (sk->sk_state == TCP_ESTABLISHED) - complete(&ad->door_bell); - state_change(sk); -} - -static int prepare_listen_socket(struct drbd_tconn *tconn, struct accept_wait_data *ad) -{ - int err, sndbuf_size, rcvbuf_size, my_addr_len; - struct sockaddr_in6 my_addr; - struct socket *s_listen; - struct net_conf *nc; + int timeo, err; + struct socket *s_estab = NULL, *s_listen; const char *what; - rcu_read_lock(); - nc = rcu_dereference(tconn->net_conf); - if (!nc) { - rcu_read_unlock(); - return -EIO; - } - sndbuf_size = nc->sndbuf_size; - rcvbuf_size = nc->rcvbuf_size; - rcu_read_unlock(); - - my_addr_len = min_t(int, tconn->my_addr_len, sizeof(struct sockaddr_in6)); - memcpy(&my_addr, &tconn->my_addr, my_addr_len); + if (!get_net_conf(mdev)) + return NULL; what = "sock_create_kern"; - err = sock_create_kern(((struct sockaddr *)&my_addr)->sa_family, - SOCK_STREAM, IPPROTO_TCP, &s_listen); + err = sock_create_kern(((struct sockaddr *)mdev->net_conf->my_addr)->sa_family, + SOCK_STREAM, IPPROTO_TCP, &s_listen); if (err) { s_listen = NULL; goto out; } - s_listen->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */ - drbd_setbufsize(s_listen, sndbuf_size, rcvbuf_size); + timeo = mdev->net_conf->try_connect_int * HZ; + timeo += (random32() & 1) ? 
timeo / 7 : -timeo / 7; /* 28.5% random jitter */ + + s_listen->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */ + s_listen->sk->sk_rcvtimeo = timeo; + s_listen->sk->sk_sndtimeo = timeo; + drbd_setbufsize(s_listen, mdev->net_conf->sndbuf_size, + mdev->net_conf->rcvbuf_size); what = "bind before listen"; - err = s_listen->ops->bind(s_listen, (struct sockaddr *)&my_addr, my_addr_len); + err = s_listen->ops->bind(s_listen, + (struct sockaddr *) mdev->net_conf->my_addr, + mdev->net_conf->my_addr_len); if (err < 0) goto out; - ad->s_listen = s_listen; - write_lock_bh(&s_listen->sk->sk_callback_lock); - ad->original_sk_state_change = s_listen->sk->sk_state_change; - s_listen->sk->sk_state_change = drbd_incoming_connection; - s_listen->sk->sk_user_data = ad; - write_unlock_bh(&s_listen->sk->sk_callback_lock); - - what = "listen"; - err = s_listen->ops->listen(s_listen, 5); - if (err < 0) - goto out; + err = drbd_accept(mdev, &what, s_listen, &s_estab); - return 0; out: if (s_listen) sock_release(s_listen); if (err < 0) { if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) { - conn_err(tconn, "%s failed, err = %d\n", what, err); - conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD); + dev_err(DEV, "%s failed, err = %d\n", what, err); + drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); } } + put_net_conf(mdev); - return -EIO; -} - -static void unregister_state_change(struct sock *sk, struct accept_wait_data *ad) -{ - write_lock_bh(&sk->sk_callback_lock); - sk->sk_state_change = ad->original_sk_state_change; - sk->sk_user_data = NULL; - write_unlock_bh(&sk->sk_callback_lock); + return s_estab; } -static struct socket *drbd_wait_for_connect(struct drbd_tconn *tconn, struct accept_wait_data *ad) +static int drbd_send_fp(struct drbd_conf *mdev, + struct socket *sock, enum drbd_packets cmd) { - int timeo, connect_int, err = 0; - struct socket *s_estab = NULL; - struct net_conf *nc; - - rcu_read_lock(); - nc = rcu_dereference(tconn->net_conf); - if (!nc) { - rcu_read_unlock(); - return NULL; - } - connect_int = nc->connect_int; - rcu_read_unlock(); - - timeo = connect_int * HZ; - timeo += (random32() & 1) ? 
timeo / 7 : -timeo / 7; /* 28.5% random jitter */ - - err = wait_for_completion_interruptible_timeout(&ad->door_bell, timeo); - if (err <= 0) - return NULL; - - err = kernel_accept(ad->s_listen, &s_estab, 0); - if (err < 0) { - if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) { - conn_err(tconn, "accept failed, err = %d\n", err); - conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD); - } - } - - if (s_estab) - unregister_state_change(s_estab->sk, ad); + struct p_header80 *h = &mdev->data.sbuf.header.h80; - return s_estab; + return _drbd_send_cmd(mdev, sock, cmd, h, sizeof(*h), 0); } -static int decode_header(struct drbd_tconn *, void *, struct packet_info *); - -static int send_first_packet(struct drbd_tconn *tconn, struct drbd_socket *sock, - enum drbd_packet cmd) +static enum drbd_packets drbd_recv_fp(struct drbd_conf *mdev, struct socket *sock) { - if (!conn_prepare_command(tconn, sock)) - return -EIO; - return conn_send_command(tconn, sock, cmd, 0, NULL, 0); -} + struct p_header80 *h = &mdev->data.rbuf.header.h80; + int rr; -static int receive_first_packet(struct drbd_tconn *tconn, struct socket *sock) -{ - unsigned int header_size = drbd_header_size(tconn); - struct packet_info pi; - int err; + rr = drbd_recv_short(mdev, sock, h, sizeof(*h), 0); - err = drbd_recv_short(sock, tconn->data.rbuf, header_size, 0); - if (err != header_size) { - if (err >= 0) - err = -EIO; - return err; - } - err = decode_header(tconn, tconn->data.rbuf, &pi); - if (err) - return err; - return pi.cmd; + if (rr == sizeof(*h) && h->magic == BE_DRBD_MAGIC) + return be16_to_cpu(h->command); + + return 0xffff; } /** * drbd_socket_okay() - Free the socket if its connection is not okay + * @mdev: DRBD device. * @sock: pointer to the pointer to the socket. */ -static int drbd_socket_okay(struct socket **sock) +static int drbd_socket_okay(struct drbd_conf *mdev, struct socket **sock) { int rr; char tb[4]; @@ -817,7 +733,7 @@ static int drbd_socket_okay(struct socket **sock) if (!*sock) return false; - rr = drbd_recv_short(*sock, tb, 4, MSG_DONTWAIT | MSG_PEEK); + rr = drbd_recv_short(mdev, *sock, tb, 4, MSG_DONTWAIT | MSG_PEEK); if (rr > 0 || rr == -EAGAIN) { return true; @@ -827,31 +743,6 @@ static int drbd_socket_okay(struct socket **sock) return false; } } -/* Gets called if a connection is established, or if a new minor gets created - in a connection */ -int drbd_connected(struct drbd_conf *mdev) -{ - int err; - - atomic_set(&mdev->packet_seq, 0); - mdev->peer_seq = 0; - - mdev->state_mutex = mdev->tconn->agreed_pro_version < 100 ? - &mdev->tconn->cstate_mutex : - &mdev->own_state_mutex; - - err = drbd_send_sync_param(mdev); - if (!err) - err = drbd_send_sizes(mdev, 0, 0); - if (!err) - err = drbd_send_uuids(mdev); - if (!err) - err = drbd_send_current_state(mdev); - clear_bit(USE_DEGR_WFC_T, &mdev->flags); - clear_bit(RESIZE_PENDING, &mdev->flags); - mod_timer(&mdev->request_timer, jiffies + HZ); /* just start it here. */ - return err; -} /* * return values: @@ -861,315 +752,232 @@ int drbd_connected(struct drbd_conf *mdev) * no point in trying again, please go standalone. * -2 We do not have a network config... 
*/ -static int conn_connect(struct drbd_tconn *tconn) +static int drbd_connect(struct drbd_conf *mdev) { - struct drbd_socket sock, msock; - struct drbd_conf *mdev; - struct net_conf *nc; - int vnr, timeout, h, ok; - bool discard_my_data; + struct socket *s, *sock, *msock; + int try, h, ok; enum drbd_state_rv rv; - struct accept_wait_data ad = { - .tconn = tconn, - .door_bell = COMPLETION_INITIALIZER_ONSTACK(ad.door_bell), - }; - clear_bit(DISCONNECT_SENT, &tconn->flags); - if (conn_request_state(tconn, NS(conn, C_WF_CONNECTION), CS_VERBOSE) < SS_SUCCESS) + D_ASSERT(!mdev->data.socket); + + if (drbd_request_state(mdev, NS(conn, C_WF_CONNECTION)) < SS_SUCCESS) return -2; - mutex_init(&sock.mutex); - sock.sbuf = tconn->data.sbuf; - sock.rbuf = tconn->data.rbuf; - sock.socket = NULL; - mutex_init(&msock.mutex); - msock.sbuf = tconn->meta.sbuf; - msock.rbuf = tconn->meta.rbuf; - msock.socket = NULL; + clear_bit(DISCARD_CONCURRENT, &mdev->flags); - /* Assume that the peer only understands protocol 80 until we know better. */ - tconn->agreed_pro_version = 80; - - if (prepare_listen_socket(tconn, &ad)) - return 0; + sock = NULL; + msock = NULL; do { - struct socket *s; + for (try = 0;;) { + /* 3 tries, this should take less than a second! */ + s = drbd_try_connect(mdev); + if (s || ++try >= 3) + break; + /* give the other side time to call bind() & listen() */ + schedule_timeout_interruptible(HZ / 10); + } - s = drbd_try_connect(tconn); if (s) { - if (!sock.socket) { - sock.socket = s; - send_first_packet(tconn, &sock, P_INITIAL_DATA); - } else if (!msock.socket) { - clear_bit(RESOLVE_CONFLICTS, &tconn->flags); - msock.socket = s; - send_first_packet(tconn, &msock, P_INITIAL_META); + if (!sock) { + drbd_send_fp(mdev, s, P_HAND_SHAKE_S); + sock = s; + s = NULL; + } else if (!msock) { + drbd_send_fp(mdev, s, P_HAND_SHAKE_M); + msock = s; + s = NULL; } else { - conn_err(tconn, "Logic error in conn_connect()\n"); + dev_err(DEV, "Logic error in drbd_connect()\n"); goto out_release_sockets; } } - if (sock.socket && msock.socket) { - rcu_read_lock(); - nc = rcu_dereference(tconn->net_conf); - timeout = nc->ping_timeo * HZ / 10; - rcu_read_unlock(); - schedule_timeout_interruptible(timeout); - ok = drbd_socket_okay(&sock.socket); - ok = drbd_socket_okay(&msock.socket) && ok; + if (sock && msock) { + schedule_timeout_interruptible(mdev->net_conf->ping_timeo*HZ/10); + ok = drbd_socket_okay(mdev, &sock); + ok = drbd_socket_okay(mdev, &msock) && ok; if (ok) break; } retry: - s = drbd_wait_for_connect(tconn, &ad); + s = drbd_wait_for_connect(mdev); if (s) { - int fp = receive_first_packet(tconn, s); - drbd_socket_okay(&sock.socket); - drbd_socket_okay(&msock.socket); - switch (fp) { - case P_INITIAL_DATA: - if (sock.socket) { - conn_warn(tconn, "initial packet S crossed\n"); - sock_release(sock.socket); - sock.socket = s; - goto randomize; + try = drbd_recv_fp(mdev, s); + drbd_socket_okay(mdev, &sock); + drbd_socket_okay(mdev, &msock); + switch (try) { + case P_HAND_SHAKE_S: + if (sock) { + dev_warn(DEV, "initial packet S crossed\n"); + sock_release(sock); } - sock.socket = s; + sock = s; break; - case P_INITIAL_META: - set_bit(RESOLVE_CONFLICTS, &tconn->flags); - if (msock.socket) { - conn_warn(tconn, "initial packet M crossed\n"); - sock_release(msock.socket); - msock.socket = s; - goto randomize; + case P_HAND_SHAKE_M: + if (msock) { + dev_warn(DEV, "initial packet M crossed\n"); + sock_release(msock); } - msock.socket = s; + msock = s; + set_bit(DISCARD_CONCURRENT, &mdev->flags); break; default: - 
conn_warn(tconn, "Error receiving initial packet\n"); + dev_warn(DEV, "Error receiving initial packet\n"); sock_release(s); -randomize: if (random32() & 1) goto retry; } } - if (tconn->cstate <= C_DISCONNECTING) + if (mdev->state.conn <= C_DISCONNECTING) goto out_release_sockets; if (signal_pending(current)) { flush_signals(current); smp_rmb(); - if (get_t_state(&tconn->receiver) == EXITING) + if (get_t_state(&mdev->receiver) == Exiting) goto out_release_sockets; } - ok = drbd_socket_okay(&sock.socket); - ok = drbd_socket_okay(&msock.socket) && ok; - } while (!ok); - - if (ad.s_listen) - sock_release(ad.s_listen); + if (sock && msock) { + ok = drbd_socket_okay(mdev, &sock); + ok = drbd_socket_okay(mdev, &msock) && ok; + if (ok) + break; + } + } while (1); - sock.socket->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */ - msock.socket->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */ + msock->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */ + sock->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */ - sock.socket->sk->sk_allocation = GFP_NOIO; - msock.socket->sk->sk_allocation = GFP_NOIO; + sock->sk->sk_allocation = GFP_NOIO; + msock->sk->sk_allocation = GFP_NOIO; - sock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK; - msock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE; + sock->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK; + msock->sk->sk_priority = TC_PRIO_INTERACTIVE; /* NOT YET ... - * sock.socket->sk->sk_sndtimeo = tconn->net_conf->timeout*HZ/10; - * sock.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; - * first set it to the P_CONNECTION_FEATURES timeout, + * sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10; + * sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; + * first set it to the P_HAND_SHAKE timeout, * which we set to 4x the configured ping_timeout. */ - rcu_read_lock(); - nc = rcu_dereference(tconn->net_conf); - - sock.socket->sk->sk_sndtimeo = - sock.socket->sk->sk_rcvtimeo = nc->ping_timeo*4*HZ/10; - - msock.socket->sk->sk_rcvtimeo = nc->ping_int*HZ; - timeout = nc->timeout * HZ / 10; - discard_my_data = nc->discard_my_data; - rcu_read_unlock(); + sock->sk->sk_sndtimeo = + sock->sk->sk_rcvtimeo = mdev->net_conf->ping_timeo*4*HZ/10; - msock.socket->sk->sk_sndtimeo = timeout; + msock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10; + msock->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ; /* we don't want delays. 
* we use TCP_CORK where appropriate, though */ - drbd_tcp_nodelay(sock.socket); - drbd_tcp_nodelay(msock.socket); + drbd_tcp_nodelay(sock); + drbd_tcp_nodelay(msock); - tconn->data.socket = sock.socket; - tconn->meta.socket = msock.socket; - tconn->last_received = jiffies; + mdev->data.socket = sock; + mdev->meta.socket = msock; + mdev->last_received = jiffies; - h = drbd_do_features(tconn); + D_ASSERT(mdev->asender.task == NULL); + + h = drbd_do_handshake(mdev); if (h <= 0) return h; - if (tconn->cram_hmac_tfm) { + if (mdev->cram_hmac_tfm) { /* drbd_request_state(mdev, NS(conn, WFAuth)); */ - switch (drbd_do_auth(tconn)) { + switch (drbd_do_auth(mdev)) { case -1: - conn_err(tconn, "Authentication of peer failed\n"); + dev_err(DEV, "Authentication of peer failed\n"); return -1; case 0: - conn_err(tconn, "Authentication of peer failed, trying again.\n"); + dev_err(DEV, "Authentication of peer failed, trying again.\n"); return 0; } } - tconn->data.socket->sk->sk_sndtimeo = timeout; - tconn->data.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; - - if (drbd_send_protocol(tconn) == -EOPNOTSUPP) - return -1; - - set_bit(STATE_SENT, &tconn->flags); - - rcu_read_lock(); - idr_for_each_entry(&tconn->volumes, mdev, vnr) { - kref_get(&mdev->kref); - /* Prevent a race between resync-handshake and - * being promoted to Primary. - * - * Grab and release the state mutex, so we know that any current - * drbd_set_role() is finished, and any incoming drbd_set_role - * will see the STATE_SENT flag, and wait for it to be cleared. - */ - mutex_lock(mdev->state_mutex); - mutex_unlock(mdev->state_mutex); + sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10; + sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; - rcu_read_unlock(); + atomic_set(&mdev->packet_seq, 0); + mdev->peer_seq = 0; - if (discard_my_data) - set_bit(DISCARD_MY_DATA, &mdev->flags); - else - clear_bit(DISCARD_MY_DATA, &mdev->flags); + if (drbd_send_protocol(mdev) == -1) + return -1; + set_bit(STATE_SENT, &mdev->flags); + drbd_send_sync_param(mdev, &mdev->sync_conf); + drbd_send_sizes(mdev, 0, 0); + drbd_send_uuids(mdev); + drbd_send_current_state(mdev); + clear_bit(USE_DEGR_WFC_T, &mdev->flags); + clear_bit(RESIZE_PENDING, &mdev->flags); - drbd_connected(mdev); - kref_put(&mdev->kref, &drbd_minor_destroy); - rcu_read_lock(); - } - rcu_read_unlock(); + spin_lock_irq(&mdev->req_lock); + rv = _drbd_set_state(_NS(mdev, conn, C_WF_REPORT_PARAMS), CS_VERBOSE, NULL); + if (mdev->state.conn != C_WF_REPORT_PARAMS) + clear_bit(STATE_SENT, &mdev->flags); + spin_unlock_irq(&mdev->req_lock); - rv = conn_request_state(tconn, NS(conn, C_WF_REPORT_PARAMS), CS_VERBOSE); - if (rv < SS_SUCCESS || tconn->cstate != C_WF_REPORT_PARAMS) { - clear_bit(STATE_SENT, &tconn->flags); + if (rv < SS_SUCCESS) return 0; - } - - drbd_thread_start(&tconn->asender); - mutex_lock(&tconn->conf_update); - /* The discard_my_data flag is a single-shot modifier to the next - * connection attempt, the handshake of which is now well underway. - * No need for rcu style copying of the whole struct - * just to clear a single value. */ - tconn->net_conf->discard_my_data = 0; - mutex_unlock(&tconn->conf_update); + drbd_thread_start(&mdev->asender); + mod_timer(&mdev->request_timer, jiffies + HZ); /* just start it here. 
*/ - return h; + return 1; out_release_sockets: - if (ad.s_listen) - sock_release(ad.s_listen); - if (sock.socket) - sock_release(sock.socket); - if (msock.socket) - sock_release(msock.socket); + if (sock) + sock_release(sock); + if (msock) + sock_release(msock); return -1; } -static int decode_header(struct drbd_tconn *tconn, void *header, struct packet_info *pi) +static int drbd_recv_header(struct drbd_conf *mdev, enum drbd_packets *cmd, unsigned int *packet_size) { - unsigned int header_size = drbd_header_size(tconn); - - if (header_size == sizeof(struct p_header100) && - *(__be32 *)header == cpu_to_be32(DRBD_MAGIC_100)) { - struct p_header100 *h = header; - if (h->pad != 0) { - conn_err(tconn, "Header padding is not zero\n"); - return -EINVAL; - } - pi->vnr = be16_to_cpu(h->volume); - pi->cmd = be16_to_cpu(h->command); - pi->size = be32_to_cpu(h->length); - } else if (header_size == sizeof(struct p_header95) && - *(__be16 *)header == cpu_to_be16(DRBD_MAGIC_BIG)) { - struct p_header95 *h = header; - pi->cmd = be16_to_cpu(h->command); - pi->size = be32_to_cpu(h->length); - pi->vnr = 0; - } else if (header_size == sizeof(struct p_header80) && - *(__be32 *)header == cpu_to_be32(DRBD_MAGIC)) { - struct p_header80 *h = header; - pi->cmd = be16_to_cpu(h->command); - pi->size = be16_to_cpu(h->length); - pi->vnr = 0; - } else { - conn_err(tconn, "Wrong magic value 0x%08x in protocol version %d\n", - be32_to_cpu(*(__be32 *)header), - tconn->agreed_pro_version); - return -EINVAL; - } - pi->data = header + header_size; - return 0; -} - -static int drbd_recv_header(struct drbd_tconn *tconn, struct packet_info *pi) -{ - void *buffer = tconn->data.rbuf; - int err; + union p_header *h = &mdev->data.rbuf.header; + int r; - err = drbd_recv_all_warn(tconn, buffer, drbd_header_size(tconn)); - if (err) - return err; + r = drbd_recv(mdev, h, sizeof(*h)); + if (unlikely(r != sizeof(*h))) { + if (!signal_pending(current)) + dev_warn(DEV, "short read expecting header on sock: r=%d\n", r); + return false; + } - err = decode_header(tconn, buffer, pi); - tconn->last_received = jiffies; + if (likely(h->h80.magic == BE_DRBD_MAGIC)) { + *cmd = be16_to_cpu(h->h80.command); + *packet_size = be16_to_cpu(h->h80.length); + } else if (h->h95.magic == BE_DRBD_MAGIC_BIG) { + *cmd = be16_to_cpu(h->h95.command); + *packet_size = be32_to_cpu(h->h95.length); + } else { + dev_err(DEV, "magic?? on data m: 0x%08x c: %d l: %d\n", + be32_to_cpu(h->h80.magic), + be16_to_cpu(h->h80.command), + be16_to_cpu(h->h80.length)); + return false; + } + mdev->last_received = jiffies; - return err; + return true; } -static void drbd_flush(struct drbd_tconn *tconn) +static void drbd_flush(struct drbd_conf *mdev) { int rv; - struct drbd_conf *mdev; - int vnr; - - if (tconn->write_ordering >= WO_bdev_flush) { - rcu_read_lock(); - idr_for_each_entry(&tconn->volumes, mdev, vnr) { - if (!get_ldev(mdev)) - continue; - kref_get(&mdev->kref); - rcu_read_unlock(); - - rv = blkdev_issue_flush(mdev->ldev->backing_bdev, - GFP_NOIO, NULL); - if (rv) { - dev_info(DEV, "local disk flush failed with status %d\n", rv); - /* would rather check on EOPNOTSUPP, but that is not reliable. 
- * don't try again for ANY return value != 0 - * if (rv == -EOPNOTSUPP) */ - drbd_bump_write_ordering(tconn, WO_drain_io); - } - put_ldev(mdev); - kref_put(&mdev->kref, &drbd_minor_destroy); - rcu_read_lock(); - if (rv) - break; + if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) { + rv = blkdev_issue_flush(mdev->ldev->backing_bdev, GFP_KERNEL, + NULL); + if (rv) { + dev_info(DEV, "local disk flush failed with status %d\n", rv); + /* would rather check on EOPNOTSUPP, but that is not reliable. + * don't try again for ANY return value != 0 + * if (rv == -EOPNOTSUPP) */ + drbd_bump_write_ordering(mdev, WO_drain_io); } - rcu_read_unlock(); + put_ldev(mdev); } } @@ -1179,7 +987,7 @@ static void drbd_flush(struct drbd_tconn *tconn) * @epoch: Epoch object. * @ev: Epoch event. */ -static enum finish_epoch drbd_may_finish_epoch(struct drbd_tconn *tconn, +static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch, enum epoch_event ev) { @@ -1187,7 +995,7 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_tconn *tconn, struct drbd_epoch *next_epoch; enum finish_epoch rv = FE_STILL_LIVE; - spin_lock(&tconn->epoch_lock); + spin_lock(&mdev->epoch_lock); do { next_epoch = NULL; @@ -1209,22 +1017,18 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_tconn *tconn, atomic_read(&epoch->active) == 0 && (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) || ev & EV_CLEANUP)) { if (!(ev & EV_CLEANUP)) { - spin_unlock(&tconn->epoch_lock); - drbd_send_b_ack(epoch->tconn, epoch->barrier_nr, epoch_size); - spin_lock(&tconn->epoch_lock); + spin_unlock(&mdev->epoch_lock); + drbd_send_b_ack(mdev, epoch->barrier_nr, epoch_size); + spin_lock(&mdev->epoch_lock); } -#if 0 - /* FIXME: dec unacked on connection, once we have - * something to count pending connection packets in. */ if (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags)) - dec_unacked(epoch->tconn); -#endif + dec_unacked(mdev); - if (tconn->current_epoch != epoch) { + if (mdev->current_epoch != epoch) { next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list); list_del(&epoch->list); ev = EV_BECAME_LAST | (ev & EV_CLEANUP); - tconn->epochs--; + mdev->epochs--; kfree(epoch); if (rv == FE_STILL_LIVE) @@ -1235,6 +1039,7 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_tconn *tconn, /* atomic_set(&epoch->active, 0); is already zero */ if (rv == FE_STILL_LIVE) rv = FE_RECYCLED; + wake_up(&mdev->ee_wait); } } @@ -1244,52 +1049,40 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_tconn *tconn, epoch = next_epoch; } while (1); - spin_unlock(&tconn->epoch_lock); + spin_unlock(&mdev->epoch_lock); return rv; } /** * drbd_bump_write_ordering() - Fall back to an other write ordering method - * @tconn: DRBD connection. + * @mdev: DRBD device. * @wo: Write ordering method to try. 
*/ -void drbd_bump_write_ordering(struct drbd_tconn *tconn, enum write_ordering_e wo) +void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo) __must_hold(local) { - struct disk_conf *dc; - struct drbd_conf *mdev; enum write_ordering_e pwo; - int vnr; static char *write_ordering_str[] = { [WO_none] = "none", [WO_drain_io] = "drain", [WO_bdev_flush] = "flush", }; - pwo = tconn->write_ordering; + pwo = mdev->write_ordering; wo = min(pwo, wo); - rcu_read_lock(); - idr_for_each_entry(&tconn->volumes, mdev, vnr) { - if (!get_ldev_if_state(mdev, D_ATTACHING)) - continue; - dc = rcu_dereference(mdev->ldev->disk_conf); - - if (wo == WO_bdev_flush && !dc->disk_flushes) - wo = WO_drain_io; - if (wo == WO_drain_io && !dc->disk_drain) - wo = WO_none; - put_ldev(mdev); - } - rcu_read_unlock(); - tconn->write_ordering = wo; - if (pwo != tconn->write_ordering || wo == WO_bdev_flush) - conn_info(tconn, "Method to ensure write ordering: %s\n", write_ordering_str[tconn->write_ordering]); + if (wo == WO_bdev_flush && mdev->ldev->dc.no_disk_flush) + wo = WO_drain_io; + if (wo == WO_drain_io && mdev->ldev->dc.no_disk_drain) + wo = WO_none; + mdev->write_ordering = wo; + if (pwo != mdev->write_ordering || wo == WO_bdev_flush) + dev_info(DEV, "Method to ensure write ordering: %s\n", write_ordering_str[mdev->write_ordering]); } /** - * drbd_submit_peer_request() + * drbd_submit_ee() * @mdev: DRBD device. - * @peer_req: peer request + * @e: epoch entry * @rw: flag field, see bio->bi_rw * * May spread the pages to multiple bios, @@ -1303,15 +1096,14 @@ void drbd_bump_write_ordering(struct drbd_tconn *tconn, enum write_ordering_e wo * on certain Xen deployments. */ /* TODO allocate from our own bio_set. */ -int drbd_submit_peer_request(struct drbd_conf *mdev, - struct drbd_peer_request *peer_req, - const unsigned rw, const int fault_type) +int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, + const unsigned rw, const int fault_type) { struct bio *bios = NULL; struct bio *bio; - struct page *page = peer_req->pages; - sector_t sector = peer_req->i.sector; - unsigned ds = peer_req->i.size; + struct page *page = e->pages; + sector_t sector = e->sector; + unsigned ds = e->size; unsigned n_bios = 0; unsigned nr_pages = (ds + PAGE_SIZE -1) >> PAGE_SHIFT; int err = -ENOMEM; @@ -1330,12 +1122,12 @@ int drbd_submit_peer_request(struct drbd_conf *mdev, dev_err(DEV, "submit_ee: Allocation of a bio failed\n"); goto fail; } - /* > peer_req->i.sector, unless this is the first bio */ + /* > e->sector, unless this is the first bio */ bio->bi_sector = sector; bio->bi_bdev = mdev->ldev->backing_bdev; bio->bi_rw = rw; - bio->bi_private = peer_req; - bio->bi_end_io = drbd_peer_request_endio; + bio->bi_private = e; + bio->bi_end_io = drbd_endio_sec; bio->bi_next = bios; bios = bio; @@ -1364,7 +1156,7 @@ int drbd_submit_peer_request(struct drbd_conf *mdev, D_ASSERT(page == NULL); D_ASSERT(ds == 0); - atomic_set(&peer_req->pending_bios, n_bios); + atomic_set(&e->pending_bios, n_bios); do { bio = bios; bios = bios->bi_next; @@ -1383,57 +1175,26 @@ int drbd_submit_peer_request(struct drbd_conf *mdev, return err; } -static void drbd_remove_epoch_entry_interval(struct drbd_conf *mdev, - struct drbd_peer_request *peer_req) -{ - struct drbd_interval *i = &peer_req->i; - - drbd_remove_interval(&mdev->write_requests, i); - drbd_clear_interval(i); - - /* Wake up any processes waiting for this peer request to complete. 
*/ - if (i->waiting) - wake_up(&mdev->misc_wait); -} - -void conn_wait_active_ee_empty(struct drbd_tconn *tconn) -{ - struct drbd_conf *mdev; - int vnr; - - rcu_read_lock(); - idr_for_each_entry(&tconn->volumes, mdev, vnr) { - kref_get(&mdev->kref); - rcu_read_unlock(); - drbd_wait_ee_list_empty(mdev, &mdev->active_ee); - kref_put(&mdev->kref, &drbd_minor_destroy); - rcu_read_lock(); - } - rcu_read_unlock(); -} - -static int receive_Barrier(struct drbd_tconn *tconn, struct packet_info *pi) +static int receive_Barrier(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) { int rv; - struct p_barrier *p = pi->data; + struct p_barrier *p = &mdev->data.rbuf.barrier; struct drbd_epoch *epoch; - /* FIXME these are unacked on connection, - * not a specific (peer)device. - */ - tconn->current_epoch->barrier_nr = p->barrier; - tconn->current_epoch->tconn = tconn; - rv = drbd_may_finish_epoch(tconn, tconn->current_epoch, EV_GOT_BARRIER_NR); + inc_unacked(mdev); + + mdev->current_epoch->barrier_nr = p->barrier; + rv = drbd_may_finish_epoch(mdev, mdev->current_epoch, EV_GOT_BARRIER_NR); /* P_BARRIER_ACK may imply that the corresponding extent is dropped from * the activity log, which means it would not be resynced in case the * R_PRIMARY crashes now. * Therefore we must send the barrier_ack after the barrier request was * completed. */ - switch (tconn->write_ordering) { + switch (mdev->write_ordering) { case WO_none: if (rv == FE_RECYCLED) - return 0; + return true; /* receiver context, in the writeout path of the other node. * avoid potential distributed deadlock */ @@ -1441,75 +1202,81 @@ static int receive_Barrier(struct drbd_tconn *tconn, struct packet_info *pi) if (epoch) break; else - conn_warn(tconn, "Allocation of an epoch failed, slowing down\n"); + dev_warn(DEV, "Allocation of an epoch failed, slowing down\n"); /* Fall through */ case WO_bdev_flush: case WO_drain_io: - conn_wait_active_ee_empty(tconn); - drbd_flush(tconn); + drbd_wait_ee_list_empty(mdev, &mdev->active_ee); + drbd_flush(mdev); - if (atomic_read(&tconn->current_epoch->epoch_size)) { + if (atomic_read(&mdev->current_epoch->epoch_size)) { epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO); if (epoch) break; } - return 0; + epoch = mdev->current_epoch; + wait_event(mdev->ee_wait, atomic_read(&epoch->epoch_size) == 0); + + D_ASSERT(atomic_read(&epoch->active) == 0); + D_ASSERT(epoch->flags == 0); + + return true; default: - conn_err(tconn, "Strangeness in tconn->write_ordering %d\n", tconn->write_ordering); - return -EIO; + dev_err(DEV, "Strangeness in mdev->write_ordering %d\n", mdev->write_ordering); + return false; } epoch->flags = 0; atomic_set(&epoch->epoch_size, 0); atomic_set(&epoch->active, 0); - spin_lock(&tconn->epoch_lock); - if (atomic_read(&tconn->current_epoch->epoch_size)) { - list_add(&epoch->list, &tconn->current_epoch->list); - tconn->current_epoch = epoch; - tconn->epochs++; + spin_lock(&mdev->epoch_lock); + if (atomic_read(&mdev->current_epoch->epoch_size)) { + list_add(&epoch->list, &mdev->current_epoch->list); + mdev->current_epoch = epoch; + mdev->epochs++; } else { /* The current_epoch got recycled while we allocated this one... 
*/ kfree(epoch); } - spin_unlock(&tconn->epoch_lock); + spin_unlock(&mdev->epoch_lock); - return 0; + return true; } /* used from receive_RSDataReply (recv_resync_read) * and from receive_Data */ -static struct drbd_peer_request * -read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, - int data_size) __must_hold(local) +static struct drbd_epoch_entry * +read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __must_hold(local) { const sector_t capacity = drbd_get_capacity(mdev->this_bdev); - struct drbd_peer_request *peer_req; + struct drbd_epoch_entry *e; struct page *page; - int dgs, ds, err; - void *dig_in = mdev->tconn->int_dig_in; - void *dig_vv = mdev->tconn->int_dig_vv; + int dgs, ds, rr; + void *dig_in = mdev->int_dig_in; + void *dig_vv = mdev->int_dig_vv; unsigned long *data; - dgs = 0; - if (mdev->tconn->peer_integrity_tfm) { - dgs = crypto_hash_digestsize(mdev->tconn->peer_integrity_tfm); - /* - * FIXME: Receive the incoming digest into the receive buffer - * here, together with its struct p_data? - */ - err = drbd_recv_all_warn(mdev->tconn, dig_in, dgs); - if (err) + dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ? + crypto_hash_digestsize(mdev->integrity_r_tfm) : 0; + + if (dgs) { + rr = drbd_recv(mdev, dig_in, dgs); + if (rr != dgs) { + if (!signal_pending(current)) + dev_warn(DEV, + "short read receiving data digest: read %d expected %d\n", + rr, dgs); return NULL; - data_size -= dgs; + } } - if (!expect(IS_ALIGNED(data_size, 512))) - return NULL; - if (!expect(data_size <= DRBD_MAX_BIO_SIZE)) - return NULL; + data_size -= dgs; + + ERR_IF(data_size & 0x1ff) return NULL; + ERR_IF(data_size > DRBD_MAX_BIO_SIZE) return NULL; /* even though we trust out peer, * we sometimes have to double check. */ @@ -1524,42 +1291,47 @@ read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD * "criss-cross" setup, that might cause write-out on some other DRBD, * which in turn might block on the other node at this very place. 
*/ - peer_req = drbd_alloc_peer_req(mdev, id, sector, data_size, GFP_NOIO); - if (!peer_req) + e = drbd_alloc_ee(mdev, id, sector, data_size, GFP_NOIO); + if (!e) return NULL; if (!data_size) - return peer_req; + return e; ds = data_size; - page = peer_req->pages; + page = e->pages; page_chain_for_each(page) { unsigned len = min_t(int, ds, PAGE_SIZE); data = kmap(page); - err = drbd_recv_all_warn(mdev->tconn, data, len); + rr = drbd_recv(mdev, data, len); if (drbd_insert_fault(mdev, DRBD_FAULT_RECEIVE)) { dev_err(DEV, "Fault injection: Corrupting data on receive\n"); data[0] = data[0] ^ (unsigned long)-1; } kunmap(page); - if (err) { - drbd_free_peer_req(mdev, peer_req); + if (rr != len) { + drbd_free_ee(mdev, e); + if (!signal_pending(current)) + dev_warn(DEV, "short read receiving data: read %d expected %d\n", + rr, len); return NULL; } - ds -= len; + ds -= rr; } if (dgs) { - drbd_csum_ee(mdev, mdev->tconn->peer_integrity_tfm, peer_req, dig_vv); + drbd_csum_ee(mdev, mdev->integrity_r_tfm, e, dig_vv); if (memcmp(dig_in, dig_vv, dgs)) { dev_err(DEV, "Digest integrity check FAILED: %llus +%u\n", (unsigned long long)sector, data_size); - drbd_free_peer_req(mdev, peer_req); + drbd_bcast_ee(mdev, "digest failed", + dgs, dig_in, dig_vv, e); + drbd_free_ee(mdev, e); return NULL; } } mdev->recv_cnt += data_size>>9; - return peer_req; + return e; } /* drbd_drain_block() just takes a data block @@ -1568,26 +1340,30 @@ read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, static int drbd_drain_block(struct drbd_conf *mdev, int data_size) { struct page *page; - int err = 0; + int rr, rv = 1; void *data; if (!data_size) - return 0; + return true; - page = drbd_alloc_pages(mdev, 1, 1); + page = drbd_pp_alloc(mdev, 1, 1); data = kmap(page); while (data_size) { - unsigned int len = min_t(int, data_size, PAGE_SIZE); - - err = drbd_recv_all_warn(mdev->tconn, data, len); - if (err) + rr = drbd_recv(mdev, data, min_t(int, data_size, PAGE_SIZE)); + if (rr != min_t(int, data_size, PAGE_SIZE)) { + rv = 0; + if (!signal_pending(current)) + dev_warn(DEV, + "short read receiving data: read %d expected %d\n", + rr, min_t(int, data_size, PAGE_SIZE)); break; - data_size -= len; + } + data_size -= rr; } kunmap(page); - drbd_free_pages(mdev, page, 0); - return err; + drbd_pp_free(mdev, page, 0); + return rv; } static int recv_dless_read(struct drbd_conf *mdev, struct drbd_request *req, @@ -1595,19 +1371,26 @@ static int recv_dless_read(struct drbd_conf *mdev, struct drbd_request *req, { struct bio_vec *bvec; struct bio *bio; - int dgs, err, i, expect; - void *dig_in = mdev->tconn->int_dig_in; - void *dig_vv = mdev->tconn->int_dig_vv; + int dgs, rr, i, expect; + void *dig_in = mdev->int_dig_in; + void *dig_vv = mdev->int_dig_vv; + + dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ? + crypto_hash_digestsize(mdev->integrity_r_tfm) : 0; - dgs = 0; - if (mdev->tconn->peer_integrity_tfm) { - dgs = crypto_hash_digestsize(mdev->tconn->peer_integrity_tfm); - err = drbd_recv_all_warn(mdev->tconn, dig_in, dgs); - if (err) - return err; - data_size -= dgs; + if (dgs) { + rr = drbd_recv(mdev, dig_in, dgs); + if (rr != dgs) { + if (!signal_pending(current)) + dev_warn(DEV, + "short read receiving data reply digest: read %d expected %d\n", + rr, dgs); + return 0; + } } + data_size -= dgs; + /* optimistically update recv_cnt. if receiving fails below, * we disconnect anyways, and counters will be reset. 
*/ mdev->recv_cnt += data_size>>9; @@ -1616,61 +1399,63 @@ static int recv_dless_read(struct drbd_conf *mdev, struct drbd_request *req, D_ASSERT(sector == bio->bi_sector); bio_for_each_segment(bvec, bio, i) { - void *mapped = kmap(bvec->bv_page) + bvec->bv_offset; expect = min_t(int, data_size, bvec->bv_len); - err = drbd_recv_all_warn(mdev->tconn, mapped, expect); + rr = drbd_recv(mdev, + kmap(bvec->bv_page)+bvec->bv_offset, + expect); kunmap(bvec->bv_page); - if (err) - return err; - data_size -= expect; + if (rr != expect) { + if (!signal_pending(current)) + dev_warn(DEV, "short read receiving data reply: " + "read %d expected %d\n", + rr, expect); + return 0; + } + data_size -= rr; } if (dgs) { - drbd_csum_bio(mdev, mdev->tconn->peer_integrity_tfm, bio, dig_vv); + drbd_csum_bio(mdev, mdev->integrity_r_tfm, bio, dig_vv); if (memcmp(dig_in, dig_vv, dgs)) { dev_err(DEV, "Digest integrity check FAILED. Broken NICs?\n"); - return -EINVAL; + return 0; } } D_ASSERT(data_size == 0); - return 0; + return 1; } -/* - * e_end_resync_block() is called in asender context via - * drbd_finish_peer_reqs(). - */ -static int e_end_resync_block(struct drbd_work *w, int unused) +/* e_end_resync_block() is called via + * drbd_process_done_ee() by asender only */ +static int e_end_resync_block(struct drbd_conf *mdev, struct drbd_work *w, int unused) { - struct drbd_peer_request *peer_req = - container_of(w, struct drbd_peer_request, w); - struct drbd_conf *mdev = w->mdev; - sector_t sector = peer_req->i.sector; - int err; + struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w; + sector_t sector = e->sector; + int ok; - D_ASSERT(drbd_interval_empty(&peer_req->i)); + D_ASSERT(hlist_unhashed(&e->collision)); - if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { - drbd_set_in_sync(mdev, sector, peer_req->i.size); - err = drbd_send_ack(mdev, P_RS_WRITE_ACK, peer_req); + if (likely((e->flags & EE_WAS_ERROR) == 0)) { + drbd_set_in_sync(mdev, sector, e->size); + ok = drbd_send_ack(mdev, P_RS_WRITE_ACK, e); } else { /* Record failure to sync */ - drbd_rs_failed_io(mdev, sector, peer_req->i.size); + drbd_rs_failed_io(mdev, sector, e->size); - err = drbd_send_ack(mdev, P_NEG_ACK, peer_req); + ok = drbd_send_ack(mdev, P_NEG_ACK, e); } dec_unacked(mdev); - return err; + return ok; } static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_size) __releases(local) { - struct drbd_peer_request *peer_req; + struct drbd_epoch_entry *e; - peer_req = read_in_block(mdev, ID_SYNCER, sector, data_size); - if (!peer_req) + e = read_in_block(mdev, ID_SYNCER, sector, data_size); + if (!e) goto fail; dec_rs_pending(mdev); @@ -1679,88 +1464,64 @@ static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_si /* corresponding dec_unacked() in e_end_resync_block() * respective _drbd_clear_done_ee */ - peer_req->w.cb = e_end_resync_block; + e->w.cb = e_end_resync_block; - spin_lock_irq(&mdev->tconn->req_lock); - list_add(&peer_req->w.list, &mdev->sync_ee); - spin_unlock_irq(&mdev->tconn->req_lock); + spin_lock_irq(&mdev->req_lock); + list_add(&e->w.list, &mdev->sync_ee); + spin_unlock_irq(&mdev->req_lock); atomic_add(data_size >> 9, &mdev->rs_sect_ev); - if (drbd_submit_peer_request(mdev, peer_req, WRITE, DRBD_FAULT_RS_WR) == 0) - return 0; + if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_RS_WR) == 0) + return true; /* don't care for the reason here */ dev_err(DEV, "submit failed, triggering re-connect\n"); - spin_lock_irq(&mdev->tconn->req_lock); - list_del(&peer_req->w.list); - 
spin_unlock_irq(&mdev->tconn->req_lock); + spin_lock_irq(&mdev->req_lock); + list_del(&e->w.list); + spin_unlock_irq(&mdev->req_lock); - drbd_free_peer_req(mdev, peer_req); + drbd_free_ee(mdev, e); fail: put_ldev(mdev); - return -EIO; + return false; } -static struct drbd_request * -find_request(struct drbd_conf *mdev, struct rb_root *root, u64 id, - sector_t sector, bool missing_ok, const char *func) +static int receive_DataReply(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) { struct drbd_request *req; - - /* Request object according to our peer */ - req = (struct drbd_request *)(unsigned long)id; - if (drbd_contains_interval(root, sector, &req->i) && req->i.local) - return req; - if (!missing_ok) { - dev_err(DEV, "%s: failed to find request 0x%lx, sector %llus\n", func, - (unsigned long)id, (unsigned long long)sector); - } - return NULL; -} - -static int receive_DataReply(struct drbd_tconn *tconn, struct packet_info *pi) -{ - struct drbd_conf *mdev; - struct drbd_request *req; sector_t sector; - int err; - struct p_data *p = pi->data; - - mdev = vnr_to_mdev(tconn, pi->vnr); - if (!mdev) - return -EIO; + int ok; + struct p_data *p = &mdev->data.rbuf.data; sector = be64_to_cpu(p->sector); - spin_lock_irq(&mdev->tconn->req_lock); - req = find_request(mdev, &mdev->read_requests, p->block_id, sector, false, __func__); - spin_unlock_irq(&mdev->tconn->req_lock); - if (unlikely(!req)) - return -EIO; + spin_lock_irq(&mdev->req_lock); + req = _ar_id_to_req(mdev, p->block_id, sector); + spin_unlock_irq(&mdev->req_lock); + if (unlikely(!req)) { + dev_err(DEV, "Got a corrupt block_id/sector pair(1).\n"); + return false; + } /* hlist_del(&req->collision) is done in _req_may_be_done, to avoid * special casing it there for the various failure cases. * still no race with drbd_fail_pending_reads */ - err = recv_dless_read(mdev, req, sector, pi->size); - if (!err) - req_mod(req, DATA_RECEIVED); + ok = recv_dless_read(mdev, req, sector, data_size); + + if (ok) + req_mod(req, data_received); /* else: nothing. handled from drbd_disconnect... * I don't think we may complete this just yet * in case we are "on-disconnect: freeze" */ - return err; + return ok; } -static int receive_RSDataReply(struct drbd_tconn *tconn, struct packet_info *pi) +static int receive_RSDataReply(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) { - struct drbd_conf *mdev; sector_t sector; - int err; - struct p_data *p = pi->data; - - mdev = vnr_to_mdev(tconn, pi->vnr); - if (!mdev) - return -EIO; + int ok; + struct p_data *p = &mdev->data.rbuf.data; sector = be64_to_cpu(p->sector); D_ASSERT(p->block_id == ID_SYNCER); @@ -1768,63 +1529,42 @@ static int receive_RSDataReply(struct drbd_tconn *tconn, struct packet_info *pi) if (get_ldev(mdev)) { /* data is submitted to disk within recv_resync_read. * corresponding put_ldev done below on error, - * or in drbd_peer_request_endio. */ - err = recv_resync_read(mdev, sector, pi->size); + * or in drbd_endio_write_sec. 
*/ + ok = recv_resync_read(mdev, sector, data_size); } else { if (__ratelimit(&drbd_ratelimit_state)) dev_err(DEV, "Can not write resync data to local disk.\n"); - err = drbd_drain_block(mdev, pi->size); + ok = drbd_drain_block(mdev, data_size); - drbd_send_ack_dp(mdev, P_NEG_ACK, p, pi->size); + drbd_send_ack_dp(mdev, P_NEG_ACK, p, data_size); } - atomic_add(pi->size >> 9, &mdev->rs_sect_in); - - return err; -} - -static void restart_conflicting_writes(struct drbd_conf *mdev, - sector_t sector, int size) -{ - struct drbd_interval *i; - struct drbd_request *req; + atomic_add(data_size >> 9, &mdev->rs_sect_in); - drbd_for_each_overlap(i, &mdev->write_requests, sector, size) { - if (!i->local) - continue; - req = container_of(i, struct drbd_request, i); - if (req->rq_state & RQ_LOCAL_PENDING || - !(req->rq_state & RQ_POSTPONED)) - continue; - /* as it is RQ_POSTPONED, this will cause it to - * be queued on the retry workqueue. */ - __req_mod(req, CONFLICT_RESOLVED, NULL); - } + return ok; } -/* - * e_end_block() is called in asender context via drbd_finish_peer_reqs(). +/* e_end_block() is called via drbd_process_done_ee(). + * this means this function only runs in the asender thread */ -static int e_end_block(struct drbd_work *w, int cancel) +static int e_end_block(struct drbd_conf *mdev, struct drbd_work *w, int cancel) { - struct drbd_peer_request *peer_req = - container_of(w, struct drbd_peer_request, w); - struct drbd_conf *mdev = w->mdev; - sector_t sector = peer_req->i.sector; - int err = 0, pcmd; - - if (peer_req->flags & EE_SEND_WRITE_ACK) { - if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { + struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w; + sector_t sector = e->sector; + int ok = 1, pcmd; + + if (mdev->net_conf->wire_protocol == DRBD_PROT_C) { + if (likely((e->flags & EE_WAS_ERROR) == 0)) { pcmd = (mdev->state.conn >= C_SYNC_SOURCE && mdev->state.conn <= C_PAUSED_SYNC_T && - peer_req->flags & EE_MAY_SET_IN_SYNC) ? + e->flags & EE_MAY_SET_IN_SYNC) ? P_RS_WRITE_ACK : P_WRITE_ACK; - err = drbd_send_ack(mdev, pcmd, peer_req); + ok &= drbd_send_ack(mdev, pcmd, e); if (pcmd == P_RS_WRITE_ACK) - drbd_set_in_sync(mdev, sector, peer_req->i.size); + drbd_set_in_sync(mdev, sector, e->size); } else { - err = drbd_send_ack(mdev, P_NEG_ACK, peer_req); + ok = drbd_send_ack(mdev, P_NEG_ACK, e); /* we expect it to be marked out of sync anyways... * maybe assert this? */ } @@ -1832,115 +1572,52 @@ static int e_end_block(struct drbd_work *w, int cancel) } /* we delete from the conflict detection hash _after_ we sent out the * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right. */ - if (peer_req->flags & EE_IN_INTERVAL_TREE) { - spin_lock_irq(&mdev->tconn->req_lock); - D_ASSERT(!drbd_interval_empty(&peer_req->i)); - drbd_remove_epoch_entry_interval(mdev, peer_req); - if (peer_req->flags & EE_RESTART_REQUESTS) - restart_conflicting_writes(mdev, sector, peer_req->i.size); - spin_unlock_irq(&mdev->tconn->req_lock); - } else - D_ASSERT(drbd_interval_empty(&peer_req->i)); - - drbd_may_finish_epoch(mdev->tconn, peer_req->epoch, EV_PUT + (cancel ? 
EV_CLEANUP : 0)); - - return err; -} - -static int e_send_ack(struct drbd_work *w, enum drbd_packet ack) -{ - struct drbd_conf *mdev = w->mdev; - struct drbd_peer_request *peer_req = - container_of(w, struct drbd_peer_request, w); - int err; - - err = drbd_send_ack(mdev, ack, peer_req); - dec_unacked(mdev); - - return err; -} - -static int e_send_superseded(struct drbd_work *w, int unused) -{ - return e_send_ack(w, P_SUPERSEDED); -} - -static int e_send_retry_write(struct drbd_work *w, int unused) -{ - struct drbd_tconn *tconn = w->mdev->tconn; - - return e_send_ack(w, tconn->agreed_pro_version >= 100 ? - P_RETRY_WRITE : P_SUPERSEDED); -} + if (mdev->net_conf->two_primaries) { + spin_lock_irq(&mdev->req_lock); + D_ASSERT(!hlist_unhashed(&e->collision)); + hlist_del_init(&e->collision); + spin_unlock_irq(&mdev->req_lock); + } else { + D_ASSERT(hlist_unhashed(&e->collision)); + } -static bool seq_greater(u32 a, u32 b) -{ - /* - * We assume 32-bit wrap-around here. - * For 24-bit wrap-around, we would have to shift: - * a <<= 8; b <<= 8; - */ - return (s32)a - (s32)b > 0; -} + drbd_may_finish_epoch(mdev, e->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0)); -static u32 seq_max(u32 a, u32 b) -{ - return seq_greater(a, b) ? a : b; + return ok; } -static bool need_peer_seq(struct drbd_conf *mdev) +static int e_send_discard_ack(struct drbd_conf *mdev, struct drbd_work *w, int unused) { - struct drbd_tconn *tconn = mdev->tconn; - int tp; - - /* - * We only need to keep track of the last packet_seq number of our peer - * if we are in dual-primary mode and we have the resolve-conflicts flag set; see - * handle_write_conflicts(). - */ + struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w; + int ok = 1; - rcu_read_lock(); - tp = rcu_dereference(mdev->tconn->net_conf)->two_primaries; - rcu_read_unlock(); + D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C); + ok = drbd_send_ack(mdev, P_DISCARD_ACK, e); - return tp && test_bit(RESOLVE_CONFLICTS, &tconn->flags); -} + spin_lock_irq(&mdev->req_lock); + D_ASSERT(!hlist_unhashed(&e->collision)); + hlist_del_init(&e->collision); + spin_unlock_irq(&mdev->req_lock); -static void update_peer_seq(struct drbd_conf *mdev, unsigned int peer_seq) -{ - unsigned int newest_peer_seq; + dec_unacked(mdev); - if (need_peer_seq(mdev)) { - spin_lock(&mdev->peer_seq_lock); - newest_peer_seq = seq_max(mdev->peer_seq, peer_seq); - mdev->peer_seq = newest_peer_seq; - spin_unlock(&mdev->peer_seq_lock); - /* wake up only if we actually changed mdev->peer_seq */ - if (peer_seq == newest_peer_seq) - wake_up(&mdev->seq_wait); - } + return ok; } -static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2) +static bool overlapping_resync_write(struct drbd_conf *mdev, struct drbd_epoch_entry *data_e) { - return !((s1 + (l1>>9) <= s2) || (s1 >= s2 + (l2>>9))); -} -/* maybe change sync_ee into interval trees as well? 
*/ -static bool overlapping_resync_write(struct drbd_conf *mdev, struct drbd_peer_request *peer_req) -{ - struct drbd_peer_request *rs_req; + struct drbd_epoch_entry *rs_e; bool rv = 0; - spin_lock_irq(&mdev->tconn->req_lock); - list_for_each_entry(rs_req, &mdev->sync_ee, w.list) { - if (overlaps(peer_req->i.sector, peer_req->i.size, - rs_req->i.sector, rs_req->i.size)) { + spin_lock_irq(&mdev->req_lock); + list_for_each_entry(rs_e, &mdev->sync_ee, w.list) { + if (overlaps(data_e->sector, data_e->size, rs_e->sector, rs_e->size)) { rv = 1; break; } } - spin_unlock_irq(&mdev->tconn->req_lock); + spin_unlock_irq(&mdev->req_lock); return rv; } @@ -1966,41 +1643,35 @@ static bool overlapping_resync_write(struct drbd_conf *mdev, struct drbd_peer_re * * returns 0 if we may process the packet, * -ERESTARTSYS if we were interrupted (by disconnect signal). */ -static int wait_for_and_update_peer_seq(struct drbd_conf *mdev, const u32 peer_seq) +static int drbd_wait_peer_seq(struct drbd_conf *mdev, const u32 packet_seq) { DEFINE_WAIT(wait); + unsigned int p_seq; long timeout; - int ret; - - if (!need_peer_seq(mdev)) - return 0; - + int ret = 0; spin_lock(&mdev->peer_seq_lock); for (;;) { - if (!seq_greater(peer_seq - 1, mdev->peer_seq)) { - mdev->peer_seq = seq_max(mdev->peer_seq, peer_seq); - ret = 0; + prepare_to_wait(&mdev->seq_wait, &wait, TASK_INTERRUPTIBLE); + if (seq_le(packet_seq, mdev->peer_seq+1)) break; - } if (signal_pending(current)) { ret = -ERESTARTSYS; break; } - prepare_to_wait(&mdev->seq_wait, &wait, TASK_INTERRUPTIBLE); + p_seq = mdev->peer_seq; spin_unlock(&mdev->peer_seq_lock); - rcu_read_lock(); - timeout = rcu_dereference(mdev->tconn->net_conf)->ping_timeo*HZ/10; - rcu_read_unlock(); - timeout = schedule_timeout(timeout); + timeout = schedule_timeout(30*HZ); spin_lock(&mdev->peer_seq_lock); - if (!timeout) { + if (timeout == 0 && p_seq == mdev->peer_seq) { ret = -ETIMEDOUT; - dev_err(DEV, "Timed out waiting for missing ack packets; disconnecting\n"); + dev_err(DEV, "ASSERT FAILED waited 30 seconds for sequence update, forcing reconnect\n"); break; } } - spin_unlock(&mdev->peer_seq_lock); finish_wait(&mdev->seq_wait, &wait); + if (mdev->peer_seq+1 == packet_seq) + mdev->peer_seq++; + spin_unlock(&mdev->peer_seq_lock); return ret; } @@ -2015,277 +1686,233 @@ static unsigned long wire_flags_to_bio(struct drbd_conf *mdev, u32 dpf) (dpf & DP_DISCARD ? REQ_DISCARD : 0); } -static void fail_postponed_requests(struct drbd_conf *mdev, sector_t sector, - unsigned int size) -{ - struct drbd_interval *i; - - repeat: - drbd_for_each_overlap(i, &mdev->write_requests, sector, size) { - struct drbd_request *req; - struct bio_and_error m; - - if (!i->local) - continue; - req = container_of(i, struct drbd_request, i); - if (!(req->rq_state & RQ_POSTPONED)) - continue; - req->rq_state &= ~RQ_POSTPONED; - __req_mod(req, NEG_ACKED, &m); - spin_unlock_irq(&mdev->tconn->req_lock); - if (m.bio) - complete_master_bio(mdev, &m); - spin_lock_irq(&mdev->tconn->req_lock); - goto repeat; - } -} - -static int handle_write_conflicts(struct drbd_conf *mdev, - struct drbd_peer_request *peer_req) -{ - struct drbd_tconn *tconn = mdev->tconn; - bool resolve_conflicts = test_bit(RESOLVE_CONFLICTS, &tconn->flags); - sector_t sector = peer_req->i.sector; - const unsigned int size = peer_req->i.size; - struct drbd_interval *i; - bool equal; - int err; - - /* - * Inserting the peer request into the write_requests tree will prevent - * new conflicting local requests from being added. 
- */ - drbd_insert_interval(&mdev->write_requests, &peer_req->i); - - repeat: - drbd_for_each_overlap(i, &mdev->write_requests, sector, size) { - if (i == &peer_req->i) - continue; - - if (!i->local) { - /* - * Our peer has sent a conflicting remote request; this - * should not happen in a two-node setup. Wait for the - * earlier peer request to complete. - */ - err = drbd_wait_misc(mdev, i); - if (err) - goto out; - goto repeat; - } - - equal = i->sector == sector && i->size == size; - if (resolve_conflicts) { - /* - * If the peer request is fully contained within the - * overlapping request, it can be considered overwritten - * and thus superseded; otherwise, it will be retried - * once all overlapping requests have completed. - */ - bool superseded = i->sector <= sector && i->sector + - (i->size >> 9) >= sector + (size >> 9); - - if (!equal) - dev_alert(DEV, "Concurrent writes detected: " - "local=%llus +%u, remote=%llus +%u, " - "assuming %s came first\n", - (unsigned long long)i->sector, i->size, - (unsigned long long)sector, size, - superseded ? "local" : "remote"); - - inc_unacked(mdev); - peer_req->w.cb = superseded ? e_send_superseded : - e_send_retry_write; - list_add_tail(&peer_req->w.list, &mdev->done_ee); - wake_asender(mdev->tconn); - - err = -ENOENT; - goto out; - } else { - struct drbd_request *req = - container_of(i, struct drbd_request, i); - - if (!equal) - dev_alert(DEV, "Concurrent writes detected: " - "local=%llus +%u, remote=%llus +%u\n", - (unsigned long long)i->sector, i->size, - (unsigned long long)sector, size); - - if (req->rq_state & RQ_LOCAL_PENDING || - !(req->rq_state & RQ_POSTPONED)) { - /* - * Wait for the node with the discard flag to - * decide if this request has been superseded - * or needs to be retried. - * Requests that have been superseded will - * disappear from the write_requests tree. - * - * In addition, wait for the conflicting - * request to finish locally before submitting - * the conflicting peer request. - */ - err = drbd_wait_misc(mdev, &req->i); - if (err) { - _conn_request_state(mdev->tconn, - NS(conn, C_TIMEOUT), - CS_HARD); - fail_postponed_requests(mdev, sector, size); - goto out; - } - goto repeat; - } - /* - * Remember to restart the conflicting requests after - * the new peer request has completed. 
- */ - peer_req->flags |= EE_RESTART_REQUESTS; - } - } - err = 0; - - out: - if (err) - drbd_remove_epoch_entry_interval(mdev, peer_req); - return err; -} - /* mirrored write */ -static int receive_Data(struct drbd_tconn *tconn, struct packet_info *pi) +static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) { - struct drbd_conf *mdev; sector_t sector; - struct drbd_peer_request *peer_req; - struct p_data *p = pi->data; - u32 peer_seq = be32_to_cpu(p->seq_num); + struct drbd_epoch_entry *e; + struct p_data *p = &mdev->data.rbuf.data; int rw = WRITE; u32 dp_flags; - int err, tp; - - mdev = vnr_to_mdev(tconn, pi->vnr); - if (!mdev) - return -EIO; if (!get_ldev(mdev)) { - int err2; - - err = wait_for_and_update_peer_seq(mdev, peer_seq); - drbd_send_ack_dp(mdev, P_NEG_ACK, p, pi->size); - atomic_inc(&tconn->current_epoch->epoch_size); - err2 = drbd_drain_block(mdev, pi->size); - if (!err) - err = err2; - return err; + spin_lock(&mdev->peer_seq_lock); + if (mdev->peer_seq+1 == be32_to_cpu(p->seq_num)) + mdev->peer_seq++; + spin_unlock(&mdev->peer_seq_lock); + + drbd_send_ack_dp(mdev, P_NEG_ACK, p, data_size); + atomic_inc(&mdev->current_epoch->epoch_size); + return drbd_drain_block(mdev, data_size); } - /* - * Corresponding put_ldev done either below (on various errors), or in - * drbd_peer_request_endio, if we successfully submit the data at the - * end of this function. - */ + /* get_ldev(mdev) successful. + * Corresponding put_ldev done either below (on various errors), + * or in drbd_endio_write_sec, if we successfully submit the data at + * the end of this function. */ sector = be64_to_cpu(p->sector); - peer_req = read_in_block(mdev, p->block_id, sector, pi->size); - if (!peer_req) { + e = read_in_block(mdev, p->block_id, sector, data_size); + if (!e) { put_ldev(mdev); - return -EIO; + return false; } - peer_req->w.cb = e_end_block; + e->w.cb = e_end_block; dp_flags = be32_to_cpu(p->dp_flags); rw |= wire_flags_to_bio(mdev, dp_flags); - if (peer_req->pages == NULL) { - D_ASSERT(peer_req->i.size == 0); + if (e->pages == NULL) { + D_ASSERT(e->size == 0); D_ASSERT(dp_flags & DP_FLUSH); } if (dp_flags & DP_MAY_SET_IN_SYNC) - peer_req->flags |= EE_MAY_SET_IN_SYNC; - - spin_lock(&tconn->epoch_lock); - peer_req->epoch = tconn->current_epoch; - atomic_inc(&peer_req->epoch->epoch_size); - atomic_inc(&peer_req->epoch->active); - spin_unlock(&tconn->epoch_lock); - - rcu_read_lock(); - tp = rcu_dereference(mdev->tconn->net_conf)->two_primaries; - rcu_read_unlock(); - if (tp) { - peer_req->flags |= EE_IN_INTERVAL_TREE; - err = wait_for_and_update_peer_seq(mdev, peer_seq); - if (err) + e->flags |= EE_MAY_SET_IN_SYNC; + + spin_lock(&mdev->epoch_lock); + e->epoch = mdev->current_epoch; + atomic_inc(&e->epoch->epoch_size); + atomic_inc(&e->epoch->active); + spin_unlock(&mdev->epoch_lock); + + /* I'm the receiver, I do hold a net_cnt reference. */ + if (!mdev->net_conf->two_primaries) { + spin_lock_irq(&mdev->req_lock); + } else { + /* don't get the req_lock yet, + * we may sleep in drbd_wait_peer_seq */ + const int size = e->size; + const int discard = test_bit(DISCARD_CONCURRENT, &mdev->flags); + DEFINE_WAIT(wait); + struct drbd_request *i; + struct hlist_node *n; + struct hlist_head *slot; + int first; + + D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C); + BUG_ON(mdev->ee_hash == NULL); + BUG_ON(mdev->tl_hash == NULL); + + /* conflict detection and handling: + * 1. wait on the sequence number, + * in case this data packet overtook ACK packets. + * 2. 
check our hash tables for conflicting requests. + * we only need to walk the tl_hash, since an ee can not + * have a conflict with an other ee: on the submitting + * node, the corresponding req had already been conflicting, + * and a conflicting req is never sent. + * + * Note: for two_primaries, we are protocol C, + * so there cannot be any request that is DONE + * but still on the transfer log. + * + * unconditionally add to the ee_hash. + * + * if no conflicting request is found: + * submit. + * + * if any conflicting request is found + * that has not yet been acked, + * AND I have the "discard concurrent writes" flag: + * queue (via done_ee) the P_DISCARD_ACK; OUT. + * + * if any conflicting request is found: + * block the receiver, waiting on misc_wait + * until no more conflicting requests are there, + * or we get interrupted (disconnect). + * + * we do not just write after local io completion of those + * requests, but only after req is done completely, i.e. + * we wait for the P_DISCARD_ACK to arrive! + * + * then proceed normally, i.e. submit. + */ + if (drbd_wait_peer_seq(mdev, be32_to_cpu(p->seq_num))) goto out_interrupted; - spin_lock_irq(&mdev->tconn->req_lock); - err = handle_write_conflicts(mdev, peer_req); - if (err) { - spin_unlock_irq(&mdev->tconn->req_lock); - if (err == -ENOENT) { + + spin_lock_irq(&mdev->req_lock); + + hlist_add_head(&e->collision, ee_hash_slot(mdev, sector)); + +#define OVERLAPS overlaps(i->sector, i->size, sector, size) + slot = tl_hash_slot(mdev, sector); + first = 1; + for (;;) { + int have_unacked = 0; + int have_conflict = 0; + prepare_to_wait(&mdev->misc_wait, &wait, + TASK_INTERRUPTIBLE); + hlist_for_each_entry(i, n, slot, collision) { + if (OVERLAPS) { + /* only ALERT on first iteration, + * we may be woken up early... */ + if (first) + dev_alert(DEV, "%s[%u] Concurrent local write detected!" + " new: %llus +%u; pending: %llus +%u\n", + current->comm, current->pid, + (unsigned long long)sector, size, + (unsigned long long)i->sector, i->size); + if (i->rq_state & RQ_NET_PENDING) + ++have_unacked; + ++have_conflict; + } + } +#undef OVERLAPS + if (!have_conflict) + break; + + /* Discard Ack only for the _first_ iteration */ + if (first && discard && have_unacked) { + dev_alert(DEV, "Concurrent write! [DISCARD BY FLAG] sec=%llus\n", + (unsigned long long)sector); + inc_unacked(mdev); + e->w.cb = e_send_discard_ack; + list_add_tail(&e->w.list, &mdev->done_ee); + + spin_unlock_irq(&mdev->req_lock); + + /* we could probably send that P_DISCARD_ACK ourselves, + * but I don't like the receiver using the msock */ + put_ldev(mdev); - return 0; + wake_asender(mdev); + finish_wait(&mdev->misc_wait, &wait); + return true; } - goto out_interrupted; - } - } else - spin_lock_irq(&mdev->tconn->req_lock); - list_add(&peer_req->w.list, &mdev->active_ee); - spin_unlock_irq(&mdev->tconn->req_lock); - if (mdev->state.conn == C_SYNC_TARGET) - wait_event(mdev->ee_wait, !overlapping_resync_write(mdev, peer_req)); + if (signal_pending(current)) { + hlist_del_init(&e->collision); - if (mdev->tconn->agreed_pro_version < 100) { - rcu_read_lock(); - switch (rcu_dereference(mdev->tconn->net_conf)->wire_protocol) { - case DRBD_PROT_C: - dp_flags |= DP_SEND_WRITE_ACK; - break; - case DRBD_PROT_B: - dp_flags |= DP_SEND_RECEIVE_ACK; - break; + spin_unlock_irq(&mdev->req_lock); + + finish_wait(&mdev->misc_wait, &wait); + goto out_interrupted; + } + + spin_unlock_irq(&mdev->req_lock); + if (first) { + first = 0; + dev_alert(DEV, "Concurrent write! 
[W AFTERWARDS] " + "sec=%llus\n", (unsigned long long)sector); + } else if (discard) { + /* we had none on the first iteration. + * there must be none now. */ + D_ASSERT(have_unacked == 0); + } + schedule(); + spin_lock_irq(&mdev->req_lock); } - rcu_read_unlock(); + finish_wait(&mdev->misc_wait, &wait); } - if (dp_flags & DP_SEND_WRITE_ACK) { - peer_req->flags |= EE_SEND_WRITE_ACK; + list_add(&e->w.list, &mdev->active_ee); + spin_unlock_irq(&mdev->req_lock); + + if (mdev->state.conn == C_SYNC_TARGET) + wait_event(mdev->ee_wait, !overlapping_resync_write(mdev, e)); + + switch (mdev->net_conf->wire_protocol) { + case DRBD_PROT_C: inc_unacked(mdev); /* corresponding dec_unacked() in e_end_block() * respective _drbd_clear_done_ee */ - } - - if (dp_flags & DP_SEND_RECEIVE_ACK) { + break; + case DRBD_PROT_B: /* I really don't like it that the receiver thread * sends on the msock, but anyways */ - drbd_send_ack(mdev, P_RECV_ACK, peer_req); + drbd_send_ack(mdev, P_RECV_ACK, e); + break; + case DRBD_PROT_A: + /* nothing to do */ + break; } if (mdev->state.pdsk < D_INCONSISTENT) { /* In case we have the only disk of the cluster, */ - drbd_set_out_of_sync(mdev, peer_req->i.sector, peer_req->i.size); - peer_req->flags |= EE_CALL_AL_COMPLETE_IO; - peer_req->flags &= ~EE_MAY_SET_IN_SYNC; - drbd_al_begin_io(mdev, &peer_req->i); + drbd_set_out_of_sync(mdev, e->sector, e->size); + e->flags |= EE_CALL_AL_COMPLETE_IO; + e->flags &= ~EE_MAY_SET_IN_SYNC; + drbd_al_begin_io(mdev, e->sector); } - err = drbd_submit_peer_request(mdev, peer_req, rw, DRBD_FAULT_DT_WR); - if (!err) - return 0; + if (drbd_submit_ee(mdev, e, rw, DRBD_FAULT_DT_WR) == 0) + return true; /* don't care for the reason here */ dev_err(DEV, "submit failed, triggering re-connect\n"); - spin_lock_irq(&mdev->tconn->req_lock); - list_del(&peer_req->w.list); - drbd_remove_epoch_entry_interval(mdev, peer_req); - spin_unlock_irq(&mdev->tconn->req_lock); - if (peer_req->flags & EE_CALL_AL_COMPLETE_IO) - drbd_al_complete_io(mdev, &peer_req->i); + spin_lock_irq(&mdev->req_lock); + list_del(&e->w.list); + hlist_del_init(&e->collision); + spin_unlock_irq(&mdev->req_lock); + if (e->flags & EE_CALL_AL_COMPLETE_IO) + drbd_al_complete_io(mdev, e->sector); out_interrupted: - drbd_may_finish_epoch(tconn, peer_req->epoch, EV_PUT + EV_CLEANUP); + drbd_may_finish_epoch(mdev, e->epoch, EV_PUT + EV_CLEANUP); put_ldev(mdev); - drbd_free_peer_req(mdev, peer_req); - return err; + drbd_free_ee(mdev, e); + return false; } /* We may throttle resync, if the lower device seems to be busy, @@ -2306,14 +1933,9 @@ int drbd_rs_should_slow_down(struct drbd_conf *mdev, sector_t sector) struct lc_element *tmp; int curr_events; int throttle = 0; - unsigned int c_min_rate; - - rcu_read_lock(); - c_min_rate = rcu_dereference(mdev->ldev->disk_conf)->c_min_rate; - rcu_read_unlock(); /* feature disabled? 
*/ - if (c_min_rate == 0) + if (mdev->sync_conf.c_min_rate == 0) return 0; spin_lock_irq(&mdev->al_lock); @@ -2353,46 +1975,40 @@ int drbd_rs_should_slow_down(struct drbd_conf *mdev, sector_t sector) db = mdev->rs_mark_left[i] - rs_left; dbdt = Bit2KB(db/dt); - if (dbdt > c_min_rate) + if (dbdt > mdev->sync_conf.c_min_rate) throttle = 1; } return throttle; } -static int receive_DataRequest(struct drbd_tconn *tconn, struct packet_info *pi) +static int receive_DataRequest(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int digest_size) { - struct drbd_conf *mdev; sector_t sector; - sector_t capacity; - struct drbd_peer_request *peer_req; + const sector_t capacity = drbd_get_capacity(mdev->this_bdev); + struct drbd_epoch_entry *e; struct digest_info *di = NULL; int size, verb; unsigned int fault_type; - struct p_block_req *p = pi->data; - - mdev = vnr_to_mdev(tconn, pi->vnr); - if (!mdev) - return -EIO; - capacity = drbd_get_capacity(mdev->this_bdev); + struct p_block_req *p = &mdev->data.rbuf.block_req; sector = be64_to_cpu(p->sector); size = be32_to_cpu(p->blksize); - if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) { + if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) { dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__, (unsigned long long)sector, size); - return -EINVAL; + return false; } if (sector + (size>>9) > capacity) { dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__, (unsigned long long)sector, size); - return -EINVAL; + return false; } if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) { verb = 1; - switch (pi->cmd) { + switch (cmd) { case P_DATA_REQUEST: drbd_send_ack_rp(mdev, P_NEG_DREPLY, p); break; @@ -2407,34 +2023,35 @@ static int receive_DataRequest(struct drbd_tconn *tconn, struct packet_info *pi) drbd_send_ack_ex(mdev, P_OV_RESULT, sector, size, ID_IN_SYNC); break; default: - BUG(); + dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n", + cmdname(cmd)); } if (verb && __ratelimit(&drbd_ratelimit_state)) dev_err(DEV, "Can not satisfy peer's read request, " "no local data.\n"); /* drain possibly payload */ - return drbd_drain_block(mdev, pi->size); + return drbd_drain_block(mdev, digest_size); } /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD * "criss-cross" setup, that might cause write-out on some other DRBD, * which in turn might block on the other node at this very place. 
*/ - peer_req = drbd_alloc_peer_req(mdev, p->block_id, sector, size, GFP_NOIO); - if (!peer_req) { + e = drbd_alloc_ee(mdev, p->block_id, sector, size, GFP_NOIO); + if (!e) { put_ldev(mdev); - return -ENOMEM; + return false; } - switch (pi->cmd) { + switch (cmd) { case P_DATA_REQUEST: - peer_req->w.cb = w_e_end_data_req; + e->w.cb = w_e_end_data_req; fault_type = DRBD_FAULT_DT_RD; /* application IO, don't drbd_rs_begin_io */ goto submit; case P_RS_DATA_REQUEST: - peer_req->w.cb = w_e_end_rsdata_req; + e->w.cb = w_e_end_rsdata_req; fault_type = DRBD_FAULT_RS_RD; /* used in the sector offset progress display */ mdev->bm_resync_fo = BM_SECT_TO_BIT(sector); @@ -2443,28 +2060,28 @@ static int receive_DataRequest(struct drbd_tconn *tconn, struct packet_info *pi) case P_OV_REPLY: case P_CSUM_RS_REQUEST: fault_type = DRBD_FAULT_RS_RD; - di = kmalloc(sizeof(*di) + pi->size, GFP_NOIO); + di = kmalloc(sizeof(*di) + digest_size, GFP_NOIO); if (!di) goto out_free_e; - di->digest_size = pi->size; + di->digest_size = digest_size; di->digest = (((char *)di)+sizeof(struct digest_info)); - peer_req->digest = di; - peer_req->flags |= EE_HAS_DIGEST; + e->digest = di; + e->flags |= EE_HAS_DIGEST; - if (drbd_recv_all(mdev->tconn, di->digest, pi->size)) + if (drbd_recv(mdev, di->digest, digest_size) != digest_size) goto out_free_e; - if (pi->cmd == P_CSUM_RS_REQUEST) { - D_ASSERT(mdev->tconn->agreed_pro_version >= 89); - peer_req->w.cb = w_e_end_csum_rs_req; + if (cmd == P_CSUM_RS_REQUEST) { + D_ASSERT(mdev->agreed_pro_version >= 89); + e->w.cb = w_e_end_csum_rs_req; /* used in the sector offset progress display */ mdev->bm_resync_fo = BM_SECT_TO_BIT(sector); - } else if (pi->cmd == P_OV_REPLY) { + } else if (cmd == P_OV_REPLY) { /* track progress, we may need to throttle */ atomic_add(size >> 9, &mdev->rs_sect_in); - peer_req->w.cb = w_e_end_ov_reply; + e->w.cb = w_e_end_ov_reply; dec_rs_pending(mdev); /* drbd_rs_begin_io done when we sent this request, * but accounting still needs to be done. 
*/ @@ -2474,7 +2091,7 @@ static int receive_DataRequest(struct drbd_tconn *tconn, struct packet_info *pi) case P_OV_REQUEST: if (mdev->ov_start_sector == ~(sector_t)0 && - mdev->tconn->agreed_pro_version >= 90) { + mdev->agreed_pro_version >= 90) { unsigned long now = jiffies; int i; mdev->ov_start_sector = sector; @@ -2488,12 +2105,15 @@ static int receive_DataRequest(struct drbd_tconn *tconn, struct packet_info *pi) dev_info(DEV, "Online Verify start sector: %llu\n", (unsigned long long)sector); } - peer_req->w.cb = w_e_end_ov_req; + e->w.cb = w_e_end_ov_req; fault_type = DRBD_FAULT_RS_RD; break; default: - BUG(); + dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n", + cmdname(cmd)); + fault_type = DRBD_FAULT_MAX; + goto out_free_e; } /* Throttle, drbd_rs_begin_io and submit should become asynchronous @@ -2528,31 +2148,30 @@ static int receive_DataRequest(struct drbd_tconn *tconn, struct packet_info *pi) submit: inc_unacked(mdev); - spin_lock_irq(&mdev->tconn->req_lock); - list_add_tail(&peer_req->w.list, &mdev->read_ee); - spin_unlock_irq(&mdev->tconn->req_lock); + spin_lock_irq(&mdev->req_lock); + list_add_tail(&e->w.list, &mdev->read_ee); + spin_unlock_irq(&mdev->req_lock); - if (drbd_submit_peer_request(mdev, peer_req, READ, fault_type) == 0) - return 0; + if (drbd_submit_ee(mdev, e, READ, fault_type) == 0) + return true; /* don't care for the reason here */ dev_err(DEV, "submit failed, triggering re-connect\n"); - spin_lock_irq(&mdev->tconn->req_lock); - list_del(&peer_req->w.list); - spin_unlock_irq(&mdev->tconn->req_lock); + spin_lock_irq(&mdev->req_lock); + list_del(&e->w.list); + spin_unlock_irq(&mdev->req_lock); /* no drbd_rs_complete_io(), we are dropping the connection anyways */ out_free_e: put_ldev(mdev); - drbd_free_peer_req(mdev, peer_req); - return -EIO; + drbd_free_ee(mdev, e); + return false; } static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local) { int self, peer, rv = -100; unsigned long ch_self, ch_peer; - enum drbd_after_sb_p after_sb_0p; self = mdev->ldev->md.uuid[UI_BITMAP] & 1; peer = mdev->p_uuid[UI_BITMAP] & 1; @@ -2560,14 +2179,10 @@ static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local) ch_peer = mdev->p_uuid[UI_SIZE]; ch_self = mdev->comm_bm_set; - rcu_read_lock(); - after_sb_0p = rcu_dereference(mdev->tconn->net_conf)->after_sb_0p; - rcu_read_unlock(); - switch (after_sb_0p) { + switch (mdev->net_conf->after_sb_0p) { case ASB_CONSENSUS: case ASB_DISCARD_SECONDARY: case ASB_CALL_HELPER: - case ASB_VIOLENTLY: dev_err(DEV, "Configuration error.\n"); break; case ASB_DISCONNECT: @@ -2596,14 +2211,14 @@ static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local) "Using discard-least-changes instead\n"); case ASB_DISCARD_ZERO_CHG: if (ch_peer == 0 && ch_self == 0) { - rv = test_bit(RESOLVE_CONFLICTS, &mdev->tconn->flags) + rv = test_bit(DISCARD_CONCURRENT, &mdev->flags) ? -1 : 1; break; } else { if (ch_peer == 0) { rv = 1; break; } if (ch_self == 0) { rv = -1; break; } } - if (after_sb_0p == ASB_DISCARD_ZERO_CHG) + if (mdev->net_conf->after_sb_0p == ASB_DISCARD_ZERO_CHG) break; case ASB_DISCARD_LEAST_CHG: if (ch_self < ch_peer) @@ -2612,7 +2227,7 @@ static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local) rv = 1; else /* ( ch_self == ch_peer ) */ /* Well, then use something else. */ - rv = test_bit(RESOLVE_CONFLICTS, &mdev->tconn->flags) + rv = test_bit(DISCARD_CONCURRENT, &mdev->flags) ? 
-1 : 1; break; case ASB_DISCARD_LOCAL: @@ -2628,18 +2243,13 @@ static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local) static int drbd_asb_recover_1p(struct drbd_conf *mdev) __must_hold(local) { int hg, rv = -100; - enum drbd_after_sb_p after_sb_1p; - rcu_read_lock(); - after_sb_1p = rcu_dereference(mdev->tconn->net_conf)->after_sb_1p; - rcu_read_unlock(); - switch (after_sb_1p) { + switch (mdev->net_conf->after_sb_1p) { case ASB_DISCARD_YOUNGER_PRI: case ASB_DISCARD_OLDER_PRI: case ASB_DISCARD_LEAST_CHG: case ASB_DISCARD_LOCAL: case ASB_DISCARD_REMOTE: - case ASB_DISCARD_ZERO_CHG: dev_err(DEV, "Configuration error.\n"); break; case ASB_DISCONNECT: @@ -2682,12 +2292,8 @@ static int drbd_asb_recover_1p(struct drbd_conf *mdev) __must_hold(local) static int drbd_asb_recover_2p(struct drbd_conf *mdev) __must_hold(local) { int hg, rv = -100; - enum drbd_after_sb_p after_sb_2p; - rcu_read_lock(); - after_sb_2p = rcu_dereference(mdev->tconn->net_conf)->after_sb_2p; - rcu_read_unlock(); - switch (after_sb_2p) { + switch (mdev->net_conf->after_sb_2p) { case ASB_DISCARD_YOUNGER_PRI: case ASB_DISCARD_OLDER_PRI: case ASB_DISCARD_LEAST_CHG: @@ -2695,7 +2301,6 @@ static int drbd_asb_recover_2p(struct drbd_conf *mdev) __must_hold(local) case ASB_DISCARD_REMOTE: case ASB_CONSENSUS: case ASB_DISCARD_SECONDARY: - case ASB_DISCARD_ZERO_CHG: dev_err(DEV, "Configuration error.\n"); break; case ASB_VIOLENTLY: @@ -2781,15 +2386,13 @@ static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(l if (mdev->p_uuid[UI_BITMAP] == (u64)0 && mdev->ldev->md.uuid[UI_BITMAP] != (u64)0) { - if (mdev->tconn->agreed_pro_version < 91) + if (mdev->agreed_pro_version < 91) return -1091; if ((mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1)) && (mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1))) { dev_info(DEV, "was SyncSource, missed the resync finished event, corrected myself:\n"); - drbd_uuid_move_history(mdev); - mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP]; - mdev->ldev->md.uuid[UI_BITMAP] = 0; + drbd_uuid_set_bm(mdev, 0UL); drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid, mdev->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(mdev) : 0, 0); @@ -2804,7 +2407,7 @@ static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(l if (mdev->ldev->md.uuid[UI_BITMAP] == (u64)0 && mdev->p_uuid[UI_BITMAP] != (u64)0) { - if (mdev->tconn->agreed_pro_version < 91) + if (mdev->agreed_pro_version < 91) return -1091; if ((mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_BITMAP] & ~((u64)1)) && @@ -2837,7 +2440,7 @@ static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(l case 1: /* self_pri && !peer_pri */ return 1; case 2: /* !self_pri && peer_pri */ return -1; case 3: /* self_pri && peer_pri */ - dc = test_bit(RESOLVE_CONFLICTS, &mdev->tconn->flags); + dc = test_bit(DISCARD_CONCURRENT, &mdev->flags); return dc ? -1 : 1; } } @@ -2850,14 +2453,14 @@ static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(l *rule_nr = 51; peer = mdev->p_uuid[UI_HISTORY_START] & ~((u64)1); if (self == peer) { - if (mdev->tconn->agreed_pro_version < 96 ? + if (mdev->agreed_pro_version < 96 ? (mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1)) : peer + UUID_NEW_BM_OFFSET == (mdev->p_uuid[UI_BITMAP] & ~((u64)1))) { /* The last P_SYNC_UUID did not get though. 
Undo the last start of resync as sync source modifications of the peer's UUIDs. */ - if (mdev->tconn->agreed_pro_version < 91) + if (mdev->agreed_pro_version < 91) return -1091; mdev->p_uuid[UI_BITMAP] = mdev->p_uuid[UI_HISTORY_START]; @@ -2887,18 +2490,18 @@ static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(l *rule_nr = 71; self = mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1); if (self == peer) { - if (mdev->tconn->agreed_pro_version < 96 ? + if (mdev->agreed_pro_version < 96 ? (mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1)) : self + UUID_NEW_BM_OFFSET == (mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1))) { /* The last P_SYNC_UUID did not get though. Undo the last start of resync as sync source modifications of our UUIDs. */ - if (mdev->tconn->agreed_pro_version < 91) + if (mdev->agreed_pro_version < 91) return -1091; - __drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_HISTORY_START]); - __drbd_uuid_set(mdev, UI_HISTORY_START, mdev->ldev->md.uuid[UI_HISTORY_START + 1]); + _drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_HISTORY_START]); + _drbd_uuid_set(mdev, UI_HISTORY_START, mdev->ldev->md.uuid[UI_HISTORY_START + 1]); dev_info(DEV, "Last syncUUID did not get through, corrected:\n"); drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid, @@ -2942,24 +2545,20 @@ static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(l static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_role peer_role, enum drbd_disk_state peer_disk) __must_hold(local) { + int hg, rule_nr; enum drbd_conns rv = C_MASK; enum drbd_disk_state mydisk; - struct net_conf *nc; - int hg, rule_nr, rr_conflict, tentative; mydisk = mdev->state.disk; if (mydisk == D_NEGOTIATING) mydisk = mdev->new_state_tmp.disk; dev_info(DEV, "drbd_sync_handshake:\n"); - - spin_lock_irq(&mdev->ldev->md.uuid_lock); drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid, mdev->comm_bm_set, 0); drbd_uuid_dump(mdev, "peer", mdev->p_uuid, mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]); hg = drbd_uuid_compare(mdev, &rule_nr); - spin_unlock_irq(&mdev->ldev->md.uuid_lock); dev_info(DEV, "uuid_compare()=%d by rule %d\n", hg, rule_nr); @@ -2985,10 +2584,7 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_rol if (abs(hg) == 100) drbd_khelper(mdev, "initial-split-brain"); - rcu_read_lock(); - nc = rcu_dereference(mdev->tconn->net_conf); - - if (hg == 100 || (hg == -100 && nc->always_asbp)) { + if (hg == 100 || (hg == -100 && mdev->net_conf->always_asbp)) { int pcount = (mdev->state.role == R_PRIMARY) + (peer_role == R_PRIMARY); int forced = (hg == -100); @@ -3017,9 +2613,9 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_rol } if (hg == -100) { - if (test_bit(DISCARD_MY_DATA, &mdev->flags) && !(mdev->p_uuid[UI_FLAGS]&1)) + if (mdev->net_conf->want_lose && !(mdev->p_uuid[UI_FLAGS]&1)) hg = -1; - if (!test_bit(DISCARD_MY_DATA, &mdev->flags) && (mdev->p_uuid[UI_FLAGS]&1)) + if (!mdev->net_conf->want_lose && (mdev->p_uuid[UI_FLAGS]&1)) hg = 1; if (abs(hg) < 100) @@ -3027,9 +2623,6 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_rol "Sync from %s node\n", (hg < 0) ? 
"peer" : "this"); } - rr_conflict = nc->rr_conflict; - tentative = nc->tentative; - rcu_read_unlock(); if (hg == -100) { /* FIXME this log message is not correct if we end up here @@ -3048,7 +2641,7 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_rol if (hg < 0 && /* by intention we do not use mydisk here. */ mdev->state.role == R_PRIMARY && mdev->state.disk >= D_CONSISTENT) { - switch (rr_conflict) { + switch (mdev->net_conf->rr_conflict) { case ASB_CALL_HELPER: drbd_khelper(mdev, "pri-lost"); /* fall through */ @@ -3061,7 +2654,7 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_rol } } - if (tentative || test_bit(CONN_DRY_RUN, &mdev->tconn->flags)) { + if (mdev->net_conf->dry_run || test_bit(CONN_DRY_RUN, &mdev->flags)) { if (hg == 0) dev_info(DEV, "dry-run connect: No resync, would become Connected immediately.\n"); else @@ -3093,29 +2686,33 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_rol return rv; } -static enum drbd_after_sb_p convert_after_sb(enum drbd_after_sb_p peer) +/* returns 1 if invalid */ +static int cmp_after_sb(enum drbd_after_sb_p peer, enum drbd_after_sb_p self) { /* ASB_DISCARD_REMOTE - ASB_DISCARD_LOCAL is valid */ - if (peer == ASB_DISCARD_REMOTE) - return ASB_DISCARD_LOCAL; + if ((peer == ASB_DISCARD_REMOTE && self == ASB_DISCARD_LOCAL) || + (self == ASB_DISCARD_REMOTE && peer == ASB_DISCARD_LOCAL)) + return 0; /* any other things with ASB_DISCARD_REMOTE or ASB_DISCARD_LOCAL are invalid */ - if (peer == ASB_DISCARD_LOCAL) - return ASB_DISCARD_REMOTE; + if (peer == ASB_DISCARD_REMOTE || peer == ASB_DISCARD_LOCAL || + self == ASB_DISCARD_REMOTE || self == ASB_DISCARD_LOCAL) + return 1; /* everything else is valid if they are equal on both sides. */ - return peer; + if (peer == self) + return 0; + + /* everything es is invalid. 
*/ + return 1; } -static int receive_protocol(struct drbd_tconn *tconn, struct packet_info *pi) +static int receive_protocol(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) { - struct p_protocol *p = pi->data; - enum drbd_after_sb_p p_after_sb_0p, p_after_sb_1p, p_after_sb_2p; - int p_proto, p_discard_my_data, p_two_primaries, cf; - struct net_conf *nc, *old_net_conf, *new_net_conf = NULL; - char integrity_alg[SHARED_SECRET_MAX] = ""; - struct crypto_hash *peer_integrity_tfm = NULL; - void *int_dig_in = NULL, *int_dig_vv = NULL; + struct p_protocol *p = &mdev->data.rbuf.protocol; + int p_proto, p_after_sb_0p, p_after_sb_1p, p_after_sb_2p; + int p_want_lose, p_two_primaries, cf; + char p_integrity_alg[SHARED_SECRET_MAX] = ""; p_proto = be32_to_cpu(p->protocol); p_after_sb_0p = be32_to_cpu(p->after_sb_0p); @@ -3123,138 +2720,63 @@ static int receive_protocol(struct drbd_tconn *tconn, struct packet_info *pi) p_after_sb_2p = be32_to_cpu(p->after_sb_2p); p_two_primaries = be32_to_cpu(p->two_primaries); cf = be32_to_cpu(p->conn_flags); - p_discard_my_data = cf & CF_DISCARD_MY_DATA; - - if (tconn->agreed_pro_version >= 87) { - int err; - - if (pi->size > sizeof(integrity_alg)) - return -EIO; - err = drbd_recv_all(tconn, integrity_alg, pi->size); - if (err) - return err; - integrity_alg[SHARED_SECRET_MAX - 1] = 0; - } - - if (pi->cmd != P_PROTOCOL_UPDATE) { - clear_bit(CONN_DRY_RUN, &tconn->flags); - - if (cf & CF_DRY_RUN) - set_bit(CONN_DRY_RUN, &tconn->flags); - - rcu_read_lock(); - nc = rcu_dereference(tconn->net_conf); - - if (p_proto != nc->wire_protocol) { - conn_err(tconn, "incompatible %s settings\n", "protocol"); - goto disconnect_rcu_unlock; - } - - if (convert_after_sb(p_after_sb_0p) != nc->after_sb_0p) { - conn_err(tconn, "incompatible %s settings\n", "after-sb-0pri"); - goto disconnect_rcu_unlock; - } - - if (convert_after_sb(p_after_sb_1p) != nc->after_sb_1p) { - conn_err(tconn, "incompatible %s settings\n", "after-sb-1pri"); - goto disconnect_rcu_unlock; - } - - if (convert_after_sb(p_after_sb_2p) != nc->after_sb_2p) { - conn_err(tconn, "incompatible %s settings\n", "after-sb-2pri"); - goto disconnect_rcu_unlock; - } - - if (p_discard_my_data && nc->discard_my_data) { - conn_err(tconn, "incompatible %s settings\n", "discard-my-data"); - goto disconnect_rcu_unlock; - } + p_want_lose = cf & CF_WANT_LOSE; - if (p_two_primaries != nc->two_primaries) { - conn_err(tconn, "incompatible %s settings\n", "allow-two-primaries"); - goto disconnect_rcu_unlock; - } + clear_bit(CONN_DRY_RUN, &mdev->flags); - if (strcmp(integrity_alg, nc->integrity_alg)) { - conn_err(tconn, "incompatible %s settings\n", "data-integrity-alg"); - goto disconnect_rcu_unlock; - } + if (cf & CF_DRY_RUN) + set_bit(CONN_DRY_RUN, &mdev->flags); - rcu_read_unlock(); + if (p_proto != mdev->net_conf->wire_protocol) { + dev_err(DEV, "incompatible communication protocols\n"); + goto disconnect; } - if (integrity_alg[0]) { - int hash_size; - - /* - * We can only change the peer data integrity algorithm - * here. Changing our own data integrity algorithm - * requires that we send a P_PROTOCOL_UPDATE packet at - * the same time; otherwise, the peer has no way to - * tell between which packets the algorithm should - * change. 
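For illustration only (this sketch is outside the diff hunks): the cmp_after_sb() check restored above encodes a small symmetry rule for the after-split-brain policies. The two discard-direction policies are compatible only when they mirror each other across the link, any other combination involving them is rejected, and all remaining policies must simply match on both sides. A stand-alone user-space sketch of that rule follows; the enum lists only a reduced subset of the real drbd_after_sb_p values.

#include <stdio.h>

enum asb_policy {
    ASB_DISCONNECT,
    ASB_DISCARD_YOUNGER_PRI,
    ASB_DISCARD_LOCAL,
    ASB_DISCARD_REMOTE,
};

/* Returns 1 if the peer/local combination is invalid, 0 if it is acceptable. */
static int asb_policies_incompatible(enum asb_policy peer, enum asb_policy self)
{
    /* ASB_DISCARD_REMOTE paired with ASB_DISCARD_LOCAL is the one valid mix. */
    if ((peer == ASB_DISCARD_REMOTE && self == ASB_DISCARD_LOCAL) ||
        (self == ASB_DISCARD_REMOTE && peer == ASB_DISCARD_LOCAL))
        return 0;

    /* Any other use of the discard-direction policies is invalid. */
    if (peer == ASB_DISCARD_REMOTE || peer == ASB_DISCARD_LOCAL ||
        self == ASB_DISCARD_REMOTE || self == ASB_DISCARD_LOCAL)
        return 1;

    /* Everything else is valid only if both sides agree. */
    return peer != self;
}

int main(void)
{
    printf("%d\n", asb_policies_incompatible(ASB_DISCARD_REMOTE, ASB_DISCARD_LOCAL));  /* 0 */
    printf("%d\n", asb_policies_incompatible(ASB_DISCARD_REMOTE, ASB_DISCARD_REMOTE)); /* 1 */
    printf("%d\n", asb_policies_incompatible(ASB_DISCONNECT, ASB_DISCONNECT));         /* 0 */
    return 0;
}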
- */ - - peer_integrity_tfm = crypto_alloc_hash(integrity_alg, 0, CRYPTO_ALG_ASYNC); - if (!peer_integrity_tfm) { - conn_err(tconn, "peer data-integrity-alg %s not supported\n", - integrity_alg); - goto disconnect; - } + if (cmp_after_sb(p_after_sb_0p, mdev->net_conf->after_sb_0p)) { + dev_err(DEV, "incompatible after-sb-0pri settings\n"); + goto disconnect; + } - hash_size = crypto_hash_digestsize(peer_integrity_tfm); - int_dig_in = kmalloc(hash_size, GFP_KERNEL); - int_dig_vv = kmalloc(hash_size, GFP_KERNEL); - if (!(int_dig_in && int_dig_vv)) { - conn_err(tconn, "Allocation of buffers for data integrity checking failed\n"); - goto disconnect; - } + if (cmp_after_sb(p_after_sb_1p, mdev->net_conf->after_sb_1p)) { + dev_err(DEV, "incompatible after-sb-1pri settings\n"); + goto disconnect; } - new_net_conf = kmalloc(sizeof(struct net_conf), GFP_KERNEL); - if (!new_net_conf) { - conn_err(tconn, "Allocation of new net_conf failed\n"); + if (cmp_after_sb(p_after_sb_2p, mdev->net_conf->after_sb_2p)) { + dev_err(DEV, "incompatible after-sb-2pri settings\n"); goto disconnect; } - mutex_lock(&tconn->data.mutex); - mutex_lock(&tconn->conf_update); - old_net_conf = tconn->net_conf; - *new_net_conf = *old_net_conf; + if (p_want_lose && mdev->net_conf->want_lose) { + dev_err(DEV, "both sides have the 'want_lose' flag set\n"); + goto disconnect; + } - new_net_conf->wire_protocol = p_proto; - new_net_conf->after_sb_0p = convert_after_sb(p_after_sb_0p); - new_net_conf->after_sb_1p = convert_after_sb(p_after_sb_1p); - new_net_conf->after_sb_2p = convert_after_sb(p_after_sb_2p); - new_net_conf->two_primaries = p_two_primaries; + if (p_two_primaries != mdev->net_conf->two_primaries) { + dev_err(DEV, "incompatible setting of the two-primaries options\n"); + goto disconnect; + } - rcu_assign_pointer(tconn->net_conf, new_net_conf); - mutex_unlock(&tconn->conf_update); - mutex_unlock(&tconn->data.mutex); + if (mdev->agreed_pro_version >= 87) { + unsigned char *my_alg = mdev->net_conf->integrity_alg; - crypto_free_hash(tconn->peer_integrity_tfm); - kfree(tconn->int_dig_in); - kfree(tconn->int_dig_vv); - tconn->peer_integrity_tfm = peer_integrity_tfm; - tconn->int_dig_in = int_dig_in; - tconn->int_dig_vv = int_dig_vv; + if (drbd_recv(mdev, p_integrity_alg, data_size) != data_size) + return false; - if (strcmp(old_net_conf->integrity_alg, integrity_alg)) - conn_info(tconn, "peer data-integrity-alg: %s\n", - integrity_alg[0] ? integrity_alg : "(none)"); + p_integrity_alg[SHARED_SECRET_MAX-1] = 0; + if (strcmp(p_integrity_alg, my_alg)) { + dev_err(DEV, "incompatible setting of the data-integrity-alg\n"); + goto disconnect; + } + dev_info(DEV, "data-integrity-alg: %s\n", + my_alg[0] ? 
my_alg : (unsigned char *)""); + } - synchronize_rcu(); - kfree(old_net_conf); - return 0; + return true; -disconnect_rcu_unlock: - rcu_read_unlock(); disconnect: - crypto_free_hash(peer_integrity_tfm); - kfree(int_dig_in); - kfree(int_dig_vv); - conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD); - return -EIO; + drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); + return false; } /* helper function @@ -3276,64 +2798,24 @@ struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_conf *mdev, alg, name, PTR_ERR(tfm)); return tfm; } - return tfm; -} - -static int ignore_remaining_packet(struct drbd_tconn *tconn, struct packet_info *pi) -{ - void *buffer = tconn->data.rbuf; - int size = pi->size; - - while (size) { - int s = min_t(int, size, DRBD_SOCKET_BUFFER_SIZE); - s = drbd_recv(tconn, buffer, s); - if (s <= 0) { - if (s < 0) - return s; - break; - } - size -= s; + if (!drbd_crypto_is_hash(crypto_hash_tfm(tfm))) { + crypto_free_hash(tfm); + dev_err(DEV, "\"%s\" is not a digest (%s)\n", alg, name); + return ERR_PTR(-EINVAL); } - if (size) - return -EIO; - return 0; -} - -/* - * config_unknown_volume - device configuration command for unknown volume - * - * When a device is added to an existing connection, the node on which the - * device is added first will send configuration commands to its peer but the - * peer will not know about the device yet. It will warn and ignore these - * commands. Once the device is added on the second node, the second node will - * send the same device configuration commands, but in the other direction. - * - * (We can also end up here if drbd is misconfigured.) - */ -static int config_unknown_volume(struct drbd_tconn *tconn, struct packet_info *pi) -{ - conn_warn(tconn, "%s packet received for volume %u, which is not configured locally\n", - cmdname(pi->cmd), pi->vnr); - return ignore_remaining_packet(tconn, pi); + return tfm; } -static int receive_SyncParam(struct drbd_tconn *tconn, struct packet_info *pi) +static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int packet_size) { - struct drbd_conf *mdev; - struct p_rs_param_95 *p; + int ok = true; + struct p_rs_param_95 *p = &mdev->data.rbuf.rs_param_95; unsigned int header_size, data_size, exp_max_sz; struct crypto_hash *verify_tfm = NULL; struct crypto_hash *csums_tfm = NULL; - struct net_conf *old_net_conf, *new_net_conf = NULL; - struct disk_conf *old_disk_conf = NULL, *new_disk_conf = NULL; - const int apv = tconn->agreed_pro_version; - struct fifo_buffer *old_plan = NULL, *new_plan = NULL; + const int apv = mdev->agreed_pro_version; + int *rs_plan_s = NULL; int fifo_size = 0; - int err; - - mdev = vnr_to_mdev(tconn, pi->vnr); - if (!mdev) - return config_unknown_volume(tconn, pi); exp_max_sz = apv <= 87 ? sizeof(struct p_rs_param) : apv == 88 ? sizeof(struct p_rs_param) @@ -3341,49 +2823,32 @@ static int receive_SyncParam(struct drbd_tconn *tconn, struct packet_info *pi) : apv <= 94 ? 
sizeof(struct p_rs_param_89) : /* apv >= 95 */ sizeof(struct p_rs_param_95); - if (pi->size > exp_max_sz) { + if (packet_size > exp_max_sz) { dev_err(DEV, "SyncParam packet too long: received %u, expected <= %u bytes\n", - pi->size, exp_max_sz); - return -EIO; + packet_size, exp_max_sz); + return false; } if (apv <= 88) { - header_size = sizeof(struct p_rs_param); - data_size = pi->size - header_size; + header_size = sizeof(struct p_rs_param) - sizeof(struct p_header80); + data_size = packet_size - header_size; } else if (apv <= 94) { - header_size = sizeof(struct p_rs_param_89); - data_size = pi->size - header_size; + header_size = sizeof(struct p_rs_param_89) - sizeof(struct p_header80); + data_size = packet_size - header_size; D_ASSERT(data_size == 0); } else { - header_size = sizeof(struct p_rs_param_95); - data_size = pi->size - header_size; + header_size = sizeof(struct p_rs_param_95) - sizeof(struct p_header80); + data_size = packet_size - header_size; D_ASSERT(data_size == 0); } /* initialize verify_alg and csums_alg */ - p = pi->data; memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX); - err = drbd_recv_all(mdev->tconn, p, header_size); - if (err) - return err; - - mutex_lock(&mdev->tconn->conf_update); - old_net_conf = mdev->tconn->net_conf; - if (get_ldev(mdev)) { - new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL); - if (!new_disk_conf) { - put_ldev(mdev); - mutex_unlock(&mdev->tconn->conf_update); - dev_err(DEV, "Allocation of new disk_conf failed\n"); - return -ENOMEM; - } - - old_disk_conf = mdev->ldev->disk_conf; - *new_disk_conf = *old_disk_conf; + if (drbd_recv(mdev, &p->head.payload, header_size) != header_size) + return false; - new_disk_conf->resync_rate = be32_to_cpu(p->resync_rate); - } + mdev->sync_conf.rate = be32_to_cpu(p->rate); if (apv >= 88) { if (apv == 88) { @@ -3391,13 +2856,12 @@ static int receive_SyncParam(struct drbd_tconn *tconn, struct packet_info *pi) dev_err(DEV, "verify-alg of wrong size, " "peer wants %u, accepting only up to %u byte\n", data_size, SHARED_SECRET_MAX); - err = -EIO; - goto reconnect; + return false; } - err = drbd_recv_all(mdev->tconn, p->verify_alg, data_size); - if (err) - goto reconnect; + if (drbd_recv(mdev, p->verify_alg, data_size) != data_size) + return false; + /* we expect NUL terminated string */ /* but just in case someone tries to be evil */ D_ASSERT(p->verify_alg[data_size-1] == 0); @@ -3412,10 +2876,10 @@ static int receive_SyncParam(struct drbd_tconn *tconn, struct packet_info *pi) p->csums_alg[SHARED_SECRET_MAX-1] = 0; } - if (strcmp(old_net_conf->verify_alg, p->verify_alg)) { + if (strcmp(mdev->sync_conf.verify_alg, p->verify_alg)) { if (mdev->state.conn == C_WF_REPORT_PARAMS) { dev_err(DEV, "Different verify-alg settings. me=\"%s\" peer=\"%s\"\n", - old_net_conf->verify_alg, p->verify_alg); + mdev->sync_conf.verify_alg, p->verify_alg); goto disconnect; } verify_tfm = drbd_crypto_alloc_digest_safe(mdev, @@ -3426,10 +2890,10 @@ static int receive_SyncParam(struct drbd_tconn *tconn, struct packet_info *pi) } } - if (apv >= 89 && strcmp(old_net_conf->csums_alg, p->csums_alg)) { + if (apv >= 89 && strcmp(mdev->sync_conf.csums_alg, p->csums_alg)) { if (mdev->state.conn == C_WF_REPORT_PARAMS) { dev_err(DEV, "Different csums-alg settings. 
me=\"%s\" peer=\"%s\"\n", - old_net_conf->csums_alg, p->csums_alg); + mdev->sync_conf.csums_alg, p->csums_alg); goto disconnect; } csums_tfm = drbd_crypto_alloc_digest_safe(mdev, @@ -3440,91 +2904,57 @@ static int receive_SyncParam(struct drbd_tconn *tconn, struct packet_info *pi) } } - if (apv > 94 && new_disk_conf) { - new_disk_conf->c_plan_ahead = be32_to_cpu(p->c_plan_ahead); - new_disk_conf->c_delay_target = be32_to_cpu(p->c_delay_target); - new_disk_conf->c_fill_target = be32_to_cpu(p->c_fill_target); - new_disk_conf->c_max_rate = be32_to_cpu(p->c_max_rate); - - fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ; - if (fifo_size != mdev->rs_plan_s->size) { - new_plan = fifo_alloc(fifo_size); - if (!new_plan) { + if (apv > 94) { + mdev->sync_conf.rate = be32_to_cpu(p->rate); + mdev->sync_conf.c_plan_ahead = be32_to_cpu(p->c_plan_ahead); + mdev->sync_conf.c_delay_target = be32_to_cpu(p->c_delay_target); + mdev->sync_conf.c_fill_target = be32_to_cpu(p->c_fill_target); + mdev->sync_conf.c_max_rate = be32_to_cpu(p->c_max_rate); + + fifo_size = (mdev->sync_conf.c_plan_ahead * 10 * SLEEP_TIME) / HZ; + if (fifo_size != mdev->rs_plan_s.size && fifo_size > 0) { + rs_plan_s = kzalloc(sizeof(int) * fifo_size, GFP_KERNEL); + if (!rs_plan_s) { dev_err(DEV, "kmalloc of fifo_buffer failed"); - put_ldev(mdev); goto disconnect; } } } - if (verify_tfm || csums_tfm) { - new_net_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL); - if (!new_net_conf) { - dev_err(DEV, "Allocation of new net_conf failed\n"); - goto disconnect; - } - - *new_net_conf = *old_net_conf; - - if (verify_tfm) { - strcpy(new_net_conf->verify_alg, p->verify_alg); - new_net_conf->verify_alg_len = strlen(p->verify_alg) + 1; - crypto_free_hash(mdev->tconn->verify_tfm); - mdev->tconn->verify_tfm = verify_tfm; - dev_info(DEV, "using verify-alg: \"%s\"\n", p->verify_alg); - } - if (csums_tfm) { - strcpy(new_net_conf->csums_alg, p->csums_alg); - new_net_conf->csums_alg_len = strlen(p->csums_alg) + 1; - crypto_free_hash(mdev->tconn->csums_tfm); - mdev->tconn->csums_tfm = csums_tfm; - dev_info(DEV, "using csums-alg: \"%s\"\n", p->csums_alg); - } - rcu_assign_pointer(tconn->net_conf, new_net_conf); + spin_lock(&mdev->peer_seq_lock); + /* lock against drbd_nl_syncer_conf() */ + if (verify_tfm) { + strcpy(mdev->sync_conf.verify_alg, p->verify_alg); + mdev->sync_conf.verify_alg_len = strlen(p->verify_alg) + 1; + crypto_free_hash(mdev->verify_tfm); + mdev->verify_tfm = verify_tfm; + dev_info(DEV, "using verify-alg: \"%s\"\n", p->verify_alg); } + if (csums_tfm) { + strcpy(mdev->sync_conf.csums_alg, p->csums_alg); + mdev->sync_conf.csums_alg_len = strlen(p->csums_alg) + 1; + crypto_free_hash(mdev->csums_tfm); + mdev->csums_tfm = csums_tfm; + dev_info(DEV, "using csums-alg: \"%s\"\n", p->csums_alg); + } + if (fifo_size != mdev->rs_plan_s.size) { + kfree(mdev->rs_plan_s.values); + mdev->rs_plan_s.values = rs_plan_s; + mdev->rs_plan_s.size = fifo_size; + mdev->rs_planed = 0; + } + spin_unlock(&mdev->peer_seq_lock); } - if (new_disk_conf) { - rcu_assign_pointer(mdev->ldev->disk_conf, new_disk_conf); - put_ldev(mdev); - } - - if (new_plan) { - old_plan = mdev->rs_plan_s; - rcu_assign_pointer(mdev->rs_plan_s, new_plan); - } - - mutex_unlock(&mdev->tconn->conf_update); - synchronize_rcu(); - if (new_net_conf) - kfree(old_net_conf); - kfree(old_disk_conf); - kfree(old_plan); - - return 0; - -reconnect: - if (new_disk_conf) { - put_ldev(mdev); - kfree(new_disk_conf); - } - mutex_unlock(&mdev->tconn->conf_update); - return -EIO; - + return ok; 
disconnect: - kfree(new_plan); - if (new_disk_conf) { - put_ldev(mdev); - kfree(new_disk_conf); - } - mutex_unlock(&mdev->tconn->conf_update); /* just for completeness: actually not needed, * as this is not reached if csums_tfm was ok. */ crypto_free_hash(csums_tfm); /* but free the verify_tfm again, if csums_tfm did not work out */ crypto_free_hash(verify_tfm); - conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD); - return -EIO; + drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); + return false; } /* warn if the arguments differ by more than 12.5% */ @@ -3540,77 +2970,59 @@ static void warn_if_differ_considerably(struct drbd_conf *mdev, (unsigned long long)a, (unsigned long long)b); } -static int receive_sizes(struct drbd_tconn *tconn, struct packet_info *pi) +static int receive_sizes(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) { - struct drbd_conf *mdev; - struct p_sizes *p = pi->data; + struct p_sizes *p = &mdev->data.rbuf.sizes; enum determine_dev_size dd = unchanged; sector_t p_size, p_usize, my_usize; int ldsc = 0; /* local disk size changed */ enum dds_flags ddsf; - mdev = vnr_to_mdev(tconn, pi->vnr); - if (!mdev) - return config_unknown_volume(tconn, pi); - p_size = be64_to_cpu(p->d_size); p_usize = be64_to_cpu(p->u_size); + if (p_size == 0 && mdev->state.disk == D_DISKLESS) { + dev_err(DEV, "some backing storage is needed\n"); + drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); + return false; + } + /* just store the peer's disk size for now. * we still need to figure out whether we accept that. */ mdev->p_size = p_size; if (get_ldev(mdev)) { - rcu_read_lock(); - my_usize = rcu_dereference(mdev->ldev->disk_conf)->disk_size; - rcu_read_unlock(); - warn_if_differ_considerably(mdev, "lower level device sizes", p_size, drbd_get_max_capacity(mdev->ldev)); warn_if_differ_considerably(mdev, "user requested size", - p_usize, my_usize); + p_usize, mdev->ldev->dc.disk_size); /* if this is the first connect, or an otherwise expected * param exchange, choose the minimum */ if (mdev->state.conn == C_WF_REPORT_PARAMS) - p_usize = min_not_zero(my_usize, p_usize); + p_usize = min_not_zero((sector_t)mdev->ldev->dc.disk_size, + p_usize); + + my_usize = mdev->ldev->dc.disk_size; + + if (mdev->ldev->dc.disk_size != p_usize) { + mdev->ldev->dc.disk_size = p_usize; + dev_info(DEV, "Peer sets u_size to %lu sectors\n", + (unsigned long)mdev->ldev->dc.disk_size); + } /* Never shrink a device with usable data during connect. But allow online shrinking if we are connected. 
*/ - if (drbd_new_dev_size(mdev, mdev->ldev, p_usize, 0) < - drbd_get_capacity(mdev->this_bdev) && - mdev->state.disk >= D_OUTDATED && - mdev->state.conn < C_CONNECTED) { + if (drbd_new_dev_size(mdev, mdev->ldev, 0) < + drbd_get_capacity(mdev->this_bdev) && + mdev->state.disk >= D_OUTDATED && + mdev->state.conn < C_CONNECTED) { dev_err(DEV, "The peer's disk size is too small!\n"); - conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD); + drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); + mdev->ldev->dc.disk_size = my_usize; put_ldev(mdev); - return -EIO; - } - - if (my_usize != p_usize) { - struct disk_conf *old_disk_conf, *new_disk_conf = NULL; - - new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL); - if (!new_disk_conf) { - dev_err(DEV, "Allocation of new disk_conf failed\n"); - put_ldev(mdev); - return -ENOMEM; - } - - mutex_lock(&mdev->tconn->conf_update); - old_disk_conf = mdev->ldev->disk_conf; - *new_disk_conf = *old_disk_conf; - new_disk_conf->disk_size = p_usize; - - rcu_assign_pointer(mdev->ldev->disk_conf, new_disk_conf); - mutex_unlock(&mdev->tconn->conf_update); - synchronize_rcu(); - kfree(old_disk_conf); - - dev_info(DEV, "Peer sets u_size to %lu sectors\n", - (unsigned long)my_usize); + return false; } - put_ldev(mdev); } @@ -3619,7 +3031,7 @@ static int receive_sizes(struct drbd_tconn *tconn, struct packet_info *pi) dd = drbd_determine_dev_size(mdev, ddsf); put_ldev(mdev); if (dd == dev_size_error) - return -EIO; + return false; drbd_md_sync(mdev); } else { /* I am diskless, need to accept the peer's size. */ @@ -3658,25 +3070,16 @@ static int receive_sizes(struct drbd_tconn *tconn, struct packet_info *pi) } } - return 0; + return true; } -static int receive_uuids(struct drbd_tconn *tconn, struct packet_info *pi) +static int receive_uuids(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) { - struct drbd_conf *mdev; - struct p_uuids *p = pi->data; + struct p_uuids *p = &mdev->data.rbuf.uuids; u64 *p_uuid; int i, updated_uuids = 0; - mdev = vnr_to_mdev(tconn, pi->vnr); - if (!mdev) - return config_unknown_volume(tconn, pi); - p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO); - if (!p_uuid) { - dev_err(DEV, "kmalloc of p_uuid failed\n"); - return false; - } for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++) p_uuid[i] = be64_to_cpu(p->uuid[i]); @@ -3690,14 +3093,14 @@ static int receive_uuids(struct drbd_tconn *tconn, struct packet_info *pi) (mdev->ed_uuid & ~((u64)1)) != (p_uuid[UI_CURRENT] & ~((u64)1))) { dev_err(DEV, "Can only connect to data with current UUID=%016llX\n", (unsigned long long)mdev->ed_uuid); - conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD); - return -EIO; + drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); + return false; } if (get_ldev(mdev)) { int skip_initial_sync = mdev->state.conn == C_CONNECTED && - mdev->tconn->agreed_pro_version >= 90 && + mdev->agreed_pro_version >= 90 && mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED && (p_uuid[UI_FLAGS] & 8); if (skip_initial_sync) { @@ -3724,15 +3127,14 @@ static int receive_uuids(struct drbd_tconn *tconn, struct packet_info *pi) ongoing cluster wide state change is finished. That is important if we are primary and are detaching from our disk. We need to see the new disk state... 
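A short illustration of the size negotiation in receive_sizes() above: at the first parameter exchange the effective requested size is the smaller of the two configured u_size values, with 0 meaning "not configured" and therefore never winning. The tiny stand-alone version below mirrors the kernel's min_not_zero() helper.

#include <stdint.h>
#include <stdio.h>

typedef uint64_t sector_t;

/* Like min(), except a value of 0 means "no limit set" and never wins. */
static sector_t min_not_zero_sectors(sector_t a, sector_t b)
{
    if (a == 0)
        return b;
    if (b == 0)
        return a;
    return a < b ? a : b;
}

int main(void)
{
    /* local admin left u_size at 0 (unlimited), peer configured a limit */
    printf("%llu\n", (unsigned long long)min_not_zero_sectors(0, 2147483648ULL));
    /* both configured: the smaller one wins */
    printf("%llu\n", (unsigned long long)min_not_zero_sectors(1048576, 2097152));
    return 0;
}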
*/ - mutex_lock(mdev->state_mutex); - mutex_unlock(mdev->state_mutex); + wait_event(mdev->misc_wait, !test_bit(CLUSTER_ST_CHANGE, &mdev->flags)); if (mdev->state.conn >= C_CONNECTED && mdev->state.disk < D_INCONSISTENT) updated_uuids |= drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]); if (updated_uuids) drbd_print_uuids(mdev, "receiver updated UUIDs to"); - return 0; + return true; } /** @@ -3744,7 +3146,6 @@ static union drbd_state convert_state(union drbd_state ps) union drbd_state ms; static enum drbd_conns c_tab[] = { - [C_WF_REPORT_PARAMS] = C_WF_REPORT_PARAMS, [C_CONNECTED] = C_CONNECTED, [C_STARTING_SYNC_S] = C_STARTING_SYNC_T, @@ -3759,81 +3160,47 @@ static union drbd_state convert_state(union drbd_state ps) ms.conn = c_tab[ps.conn]; ms.peer = ps.role; ms.role = ps.peer; - ms.pdsk = ps.disk; - ms.disk = ps.pdsk; - ms.peer_isp = (ps.aftr_isp | ps.user_isp); - - return ms; -} - -static int receive_req_state(struct drbd_tconn *tconn, struct packet_info *pi) -{ - struct drbd_conf *mdev; - struct p_req_state *p = pi->data; - union drbd_state mask, val; - enum drbd_state_rv rv; - - mdev = vnr_to_mdev(tconn, pi->vnr); - if (!mdev) - return -EIO; - - mask.i = be32_to_cpu(p->mask); - val.i = be32_to_cpu(p->val); - - if (test_bit(RESOLVE_CONFLICTS, &mdev->tconn->flags) && - mutex_is_locked(mdev->state_mutex)) { - drbd_send_sr_reply(mdev, SS_CONCURRENT_ST_CHG); - return 0; - } - - mask = convert_state(mask); - val = convert_state(val); - - rv = drbd_change_state(mdev, CS_VERBOSE, mask, val); - drbd_send_sr_reply(mdev, rv); - - drbd_md_sync(mdev); + ms.pdsk = ps.disk; + ms.disk = ps.pdsk; + ms.peer_isp = (ps.aftr_isp | ps.user_isp); - return 0; + return ms; } -static int receive_req_conn_state(struct drbd_tconn *tconn, struct packet_info *pi) +static int receive_req_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) { - struct p_req_state *p = pi->data; + struct p_req_state *p = &mdev->data.rbuf.req_state; union drbd_state mask, val; enum drbd_state_rv rv; mask.i = be32_to_cpu(p->mask); val.i = be32_to_cpu(p->val); - if (test_bit(RESOLVE_CONFLICTS, &tconn->flags) && - mutex_is_locked(&tconn->cstate_mutex)) { - conn_send_sr_reply(tconn, SS_CONCURRENT_ST_CHG); - return 0; + if (test_bit(DISCARD_CONCURRENT, &mdev->flags) && + test_bit(CLUSTER_ST_CHANGE, &mdev->flags)) { + drbd_send_sr_reply(mdev, SS_CONCURRENT_ST_CHG); + return true; } mask = convert_state(mask); val = convert_state(val); - rv = conn_request_state(tconn, mask, val, CS_VERBOSE | CS_LOCAL_ONLY | CS_IGN_OUTD_FAIL); - conn_send_sr_reply(tconn, rv); + rv = drbd_change_state(mdev, CS_VERBOSE, mask, val); - return 0; + drbd_send_sr_reply(mdev, rv); + drbd_md_sync(mdev); + + return true; } -static int receive_state(struct drbd_tconn *tconn, struct packet_info *pi) +static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) { - struct drbd_conf *mdev; - struct p_state *p = pi->data; + struct p_state *p = &mdev->data.rbuf.state; union drbd_state os, ns, peer_state; enum drbd_disk_state real_peer_disk; enum chg_state_flags cs_flags; int rv; - mdev = vnr_to_mdev(tconn, pi->vnr); - if (!mdev) - return config_unknown_volume(tconn, pi); - peer_state.i = be32_to_cpu(p->state); real_peer_disk = peer_state.disk; @@ -3842,16 +3209,16 @@ static int receive_state(struct drbd_tconn *tconn, struct packet_info *pi) dev_info(DEV, "real peer disk state = %s\n", drbd_disk_str(real_peer_disk)); } - spin_lock_irq(&mdev->tconn->req_lock); + spin_lock_irq(&mdev->req_lock); retry: - os = ns = 
drbd_read_state(mdev); - spin_unlock_irq(&mdev->tconn->req_lock); + os = ns = mdev->state; + spin_unlock_irq(&mdev->req_lock); /* If some other part of the code (asender thread, timeout) * already decided to close the connection again, * we must not "re-establish" it here. */ if (os.conn <= C_TEAR_DOWN) - return -ECONNRESET; + return false; /* If this is the "end of sync" confirmation, usually the peer disk * transitions from D_INCONSISTENT to D_UP_TO_DATE. For empty (0 bits @@ -3879,18 +3246,10 @@ static int receive_state(struct drbd_tconn *tconn, struct packet_info *pi) peer_state.conn == C_CONNECTED) { if (drbd_bm_total_weight(mdev) <= mdev->rs_failed) drbd_resync_finished(mdev); - return 0; + return true; } } - /* explicit verify finished notification, stop sector reached. */ - if (os.conn == C_VERIFY_T && os.disk == D_UP_TO_DATE && - peer_state.conn == C_CONNECTED && real_peer_disk == D_UP_TO_DATE) { - ov_out_of_sync_print(mdev); - drbd_resync_finished(mdev); - return 0; - } - /* peer says his disk is inconsistent, while we think it is uptodate, * and this happens while the peer still thinks we have a sync going on, * but we think we are already done with the sync. @@ -3939,17 +3298,17 @@ static int receive_state(struct drbd_tconn *tconn, struct packet_info *pi) peer_state.disk = D_DISKLESS; real_peer_disk = D_DISKLESS; } else { - if (test_and_clear_bit(CONN_DRY_RUN, &mdev->tconn->flags)) - return -EIO; + if (test_and_clear_bit(CONN_DRY_RUN, &mdev->flags)) + return false; D_ASSERT(os.conn == C_WF_REPORT_PARAMS); - conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD); - return -EIO; + drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); + return false; } } } - spin_lock_irq(&mdev->tconn->req_lock); - if (os.i != drbd_read_state(mdev).i) + spin_lock_irq(&mdev->req_lock); + if (mdev->state.i != os.i) goto retry; clear_bit(CONSIDER_RESYNC, &mdev->flags); ns.peer = peer_state.role; @@ -3958,25 +3317,25 @@ static int receive_state(struct drbd_tconn *tconn, struct packet_info *pi) if ((ns.conn == C_CONNECTED || ns.conn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING) ns.disk = mdev->new_state_tmp.disk; cs_flags = CS_VERBOSE + (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED ? 0 : CS_HARD); - if (ns.pdsk == D_CONSISTENT && drbd_suspended(mdev) && ns.conn == C_CONNECTED && os.conn < C_CONNECTED && + if (ns.pdsk == D_CONSISTENT && is_susp(ns) && ns.conn == C_CONNECTED && os.conn < C_CONNECTED && test_bit(NEW_CUR_UUID, &mdev->flags)) { - /* Do not allow tl_restart(RESEND) for a rebooted peer. We can only allow this + /* Do not allow tl_restart(resend) for a rebooted peer. We can only allow this for temporal network outages! 
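For reference, the convert_state() helper shown a little earlier re-expresses a peer-reported state from the local point of view: the role fields swap, the disk fields swap, and the two interrupt-sync flags collapse into peer_isp. The connection-state mapping through c_tab is omitted in this reduced, purely illustrative model.

#include <stdio.h>

/* A cut-down model of union drbd_state: only the fields the mirroring touches. */
struct mini_state {
    int role, peer;        /* my role / the other node's role */
    int disk, pdsk;        /* my disk state / the peer's disk state */
    int aftr_isp, user_isp, peer_isp;
};

/* Re-express a peer-reported state from the local point of view. */
static struct mini_state convert_state_(struct mini_state ps)
{
    struct mini_state ms = { 0 };

    ms.peer = ps.role;                        /* his role is my peer's role */
    ms.role = ps.peer;                        /* his view of me is my role */
    ms.pdsk = ps.disk;                        /* his disk is my peer disk */
    ms.disk = ps.pdsk;
    ms.peer_isp = ps.aftr_isp | ps.user_isp;  /* any pause reason becomes peer_isp */
    return ms;
}

int main(void)
{
    struct mini_state peer = { .role = 1, .peer = 2, .disk = 8, .pdsk = 4, .user_isp = 1 };
    struct mini_state mine = convert_state_(peer);

    printf("role=%d peer=%d disk=%d pdsk=%d peer_isp=%d\n",
           mine.role, mine.peer, mine.disk, mine.pdsk, mine.peer_isp);
    return 0;
}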
*/ - spin_unlock_irq(&mdev->tconn->req_lock); + spin_unlock_irq(&mdev->req_lock); dev_err(DEV, "Aborting Connect, can not thaw IO with an only Consistent peer\n"); - tl_clear(mdev->tconn); + tl_clear(mdev); drbd_uuid_new_current(mdev); clear_bit(NEW_CUR_UUID, &mdev->flags); - conn_request_state(mdev->tconn, NS2(conn, C_PROTOCOL_ERROR, susp, 0), CS_HARD); - return -EIO; + drbd_force_state(mdev, NS2(conn, C_PROTOCOL_ERROR, susp, 0)); + return false; } rv = _drbd_set_state(mdev, ns, cs_flags, NULL); - ns = drbd_read_state(mdev); - spin_unlock_irq(&mdev->tconn->req_lock); + ns = mdev->state; + spin_unlock_irq(&mdev->req_lock); if (rv < SS_SUCCESS) { - conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD); - return -EIO; + drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); + return false; } if (os.conn > C_WF_REPORT_PARAMS) { @@ -3990,21 +3349,16 @@ static int receive_state(struct drbd_tconn *tconn, struct packet_info *pi) } } - clear_bit(DISCARD_MY_DATA, &mdev->flags); + mdev->net_conf->want_lose = 0; drbd_md_sync(mdev); /* update connected indicator, la_size, ... */ - return 0; + return true; } -static int receive_sync_uuid(struct drbd_tconn *tconn, struct packet_info *pi) +static int receive_sync_uuid(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) { - struct drbd_conf *mdev; - struct p_rs_uuid *p = pi->data; - - mdev = vnr_to_mdev(tconn, pi->vnr); - if (!mdev) - return -EIO; + struct p_rs_uuid *p = &mdev->data.rbuf.rs_uuid; wait_event(mdev->misc_wait, mdev->state.conn == C_WF_SYNC_UUID || @@ -4027,7 +3381,7 @@ static int receive_sync_uuid(struct drbd_tconn *tconn, struct packet_info *pi) } else dev_err(DEV, "Ignoring SyncUUID packet!\n"); - return 0; + return true; } /** @@ -4037,27 +3391,27 @@ static int receive_sync_uuid(struct drbd_tconn *tconn, struct packet_info *pi) * code upon failure. 
*/ static int -receive_bitmap_plain(struct drbd_conf *mdev, unsigned int size, - unsigned long *p, struct bm_xfer_ctx *c) +receive_bitmap_plain(struct drbd_conf *mdev, unsigned int data_size, + unsigned long *buffer, struct bm_xfer_ctx *c) { - unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE - - drbd_header_size(mdev->tconn); - unsigned int num_words = min_t(size_t, data_size / sizeof(*p), - c->bm_words - c->word_offset); - unsigned int want = num_words * sizeof(*p); + unsigned num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset); + unsigned want = num_words * sizeof(long); int err; - if (want != size) { - dev_err(DEV, "%s:want (%u) != size (%u)\n", __func__, want, size); + if (want != data_size) { + dev_err(DEV, "%s:want (%u) != data_size (%u)\n", __func__, want, data_size); return -EIO; } if (want == 0) return 0; - err = drbd_recv_all(mdev->tconn, p, want); - if (err) + err = drbd_recv(mdev, buffer, want); + if (err != want) { + if (err >= 0) + err = -EIO; return err; + } - drbd_bm_merge_lel(mdev, c->word_offset, num_words, p); + drbd_bm_merge_lel(mdev, c->word_offset, num_words, buffer); c->word_offset += num_words; c->bit_offset = c->word_offset * BITS_PER_LONG; @@ -4067,21 +3421,6 @@ receive_bitmap_plain(struct drbd_conf *mdev, unsigned int size, return 1; } -static enum drbd_bitmap_code dcbp_get_code(struct p_compressed_bm *p) -{ - return (enum drbd_bitmap_code)(p->encoding & 0x0f); -} - -static int dcbp_get_start(struct p_compressed_bm *p) -{ - return (p->encoding & 0x80) != 0; -} - -static int dcbp_get_pad_bits(struct p_compressed_bm *p) -{ - return (p->encoding >> 4) & 0x7; -} - /** * recv_bm_rle_bits * @@ -4091,8 +3430,7 @@ static int dcbp_get_pad_bits(struct p_compressed_bm *p) static int recv_bm_rle_bits(struct drbd_conf *mdev, struct p_compressed_bm *p, - struct bm_xfer_ctx *c, - unsigned int len) + struct bm_xfer_ctx *c) { struct bitstream bs; u64 look_ahead; @@ -4100,11 +3438,12 @@ recv_bm_rle_bits(struct drbd_conf *mdev, u64 tmp; unsigned long s = c->bit_offset; unsigned long e; - int toggle = dcbp_get_start(p); + int len = be16_to_cpu(p->head.length) - (sizeof(*p) - sizeof(p->head)); + int toggle = DCBP_get_start(p); int have; int bits; - bitstream_init(&bs, p->code, len, dcbp_get_pad_bits(p)); + bitstream_init(&bs, p->code, len, DCBP_get_pad_bits(p)); bits = bitstream_get_bits(&bs, &look_ahead, 64); if (bits < 0) @@ -4156,18 +3495,17 @@ recv_bm_rle_bits(struct drbd_conf *mdev, static int decode_bitmap_c(struct drbd_conf *mdev, struct p_compressed_bm *p, - struct bm_xfer_ctx *c, - unsigned int len) + struct bm_xfer_ctx *c) { - if (dcbp_get_code(p) == RLE_VLI_Bits) - return recv_bm_rle_bits(mdev, p, c, len - sizeof(*p)); + if (DCBP_get_code(p) == RLE_VLI_Bits) + return recv_bm_rle_bits(mdev, p, c); /* other variants had been implemented for evaluation, * but have been dropped as this one turned out to be "best" * during all our tests. 
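To illustrate the compressed-bitmap receive path above: recv_bm_rle_bits() walks run lengths whose meaning alternates between clear and set bits, starting with whichever phase the toggle flag announces, and marks the "set" runs as out of sync. The sketch below takes already-decoded run lengths instead of reading variable-length integers from a bitstream, and uses a small fixed-size bitmap.

#include <stdio.h>
#include <string.h>

#define NBITS 64

static void set_bit_(unsigned char *bm, unsigned long i) { bm[i / 8] |= 1u << (i % 8); }

/* Apply run-length encoded out-of-sync information to a bitmap.
 * runs[] holds the lengths of alternating runs; "toggle" says whether the
 * very first run describes set bits. Returns the number of bits consumed,
 * or -1 if the runs overflow the bitmap. */
static long apply_rle_runs(unsigned char *bm, const unsigned long *runs,
                           int nruns, int toggle)
{
    unsigned long s = 0;

    for (int i = 0; i < nruns; i++, toggle = !toggle) {
        unsigned long e = s + runs[i] - 1;

        if (runs[i] == 0 || e >= NBITS)
            return -1;
        if (toggle)                        /* a run of set (out-of-sync) bits */
            for (unsigned long b = s; b <= e; b++)
                set_bit_(bm, b);
        s = e + 1;
    }
    return (long)s;
}

int main(void)
{
    unsigned char bm[NBITS / 8];
    unsigned long runs[] = { 10, 3, 20, 5 };   /* 10 clear, 3 set, 20 clear, 5 set */

    memset(bm, 0, sizeof(bm));
    printf("consumed %ld bits\n", apply_rle_runs(bm, runs, 4, 0));
    return 0;
}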
*/ dev_err(DEV, "receive_bitmap_c: unknown encoding %u\n", p->encoding); - conn_request_state(mdev->tconn, NS(conn, C_PROTOCOL_ERROR), CS_HARD); + drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); return -EIO; } @@ -4175,13 +3513,11 @@ void INFO_bm_xfer_stats(struct drbd_conf *mdev, const char *direction, struct bm_xfer_ctx *c) { /* what would it take to transfer it "plaintext" */ - unsigned int header_size = drbd_header_size(mdev->tconn); - unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE - header_size; - unsigned int plain = - header_size * (DIV_ROUND_UP(c->bm_words, data_size) + 1) + - c->bm_words * sizeof(unsigned long); - unsigned int total = c->bytes[0] + c->bytes[1]; - unsigned int r; + unsigned plain = sizeof(struct p_header80) * + ((c->bm_words+BM_PACKET_WORDS-1)/BM_PACKET_WORDS+1) + + c->bm_words * sizeof(long); + unsigned total = c->bytes[0] + c->bytes[1]; + unsigned r; /* total can not be zero. but just in case: */ if (total == 0) @@ -4215,63 +3551,67 @@ void INFO_bm_xfer_stats(struct drbd_conf *mdev, in order to be agnostic to the 32 vs 64 bits issue. returns 0 on failure, 1 if we successfully received it. */ -static int receive_bitmap(struct drbd_tconn *tconn, struct packet_info *pi) +static int receive_bitmap(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) { - struct drbd_conf *mdev; struct bm_xfer_ctx c; + void *buffer; int err; - - mdev = vnr_to_mdev(tconn, pi->vnr); - if (!mdev) - return -EIO; + int ok = false; + struct p_header80 *h = &mdev->data.rbuf.header.h80; drbd_bm_lock(mdev, "receive bitmap", BM_LOCKED_SET_ALLOWED); /* you are supposed to send additional out-of-sync information * if you actually set bits during this phase */ + /* maybe we should use some per thread scratch page, + * and allocate that during initial device creation? */ + buffer = (unsigned long *) __get_free_page(GFP_NOIO); + if (!buffer) { + dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__); + goto out; + } + c = (struct bm_xfer_ctx) { .bm_bits = drbd_bm_bits(mdev), .bm_words = drbd_bm_words(mdev), }; for(;;) { - if (pi->cmd == P_BITMAP) - err = receive_bitmap_plain(mdev, pi->size, pi->data, &c); - else if (pi->cmd == P_COMPRESSED_BITMAP) { + if (cmd == P_BITMAP) { + err = receive_bitmap_plain(mdev, data_size, buffer, &c); + } else if (cmd == P_COMPRESSED_BITMAP) { /* MAYBE: sanity check that we speak proto >= 90, * and the feature is enabled! 
*/ - struct p_compressed_bm *p = pi->data; + struct p_compressed_bm *p; - if (pi->size > DRBD_SOCKET_BUFFER_SIZE - drbd_header_size(tconn)) { + if (data_size > BM_PACKET_PAYLOAD_BYTES) { dev_err(DEV, "ReportCBitmap packet too large\n"); - err = -EIO; goto out; } - if (pi->size <= sizeof(*p)) { - dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", pi->size); - err = -EIO; + /* use the page buff */ + p = buffer; + memcpy(p, h, sizeof(*h)); + if (drbd_recv(mdev, p->head.payload, data_size) != data_size) + goto out; + if (data_size <= (sizeof(*p) - sizeof(p->head))) { + dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", data_size); goto out; } - err = drbd_recv_all(mdev->tconn, p, pi->size); - if (err) - goto out; - err = decode_bitmap_c(mdev, p, &c, pi->size); + err = decode_bitmap_c(mdev, p, &c); } else { - dev_warn(DEV, "receive_bitmap: cmd neither ReportBitMap nor ReportCBitMap (is 0x%x)", pi->cmd); - err = -EIO; + dev_warn(DEV, "receive_bitmap: cmd neither ReportBitMap nor ReportCBitMap (is 0x%x)", cmd); goto out; } - c.packets[pi->cmd == P_BITMAP]++; - c.bytes[pi->cmd == P_BITMAP] += drbd_header_size(tconn) + pi->size; + c.packets[cmd == P_BITMAP]++; + c.bytes[cmd == P_BITMAP] += sizeof(struct p_header80) + data_size; if (err <= 0) { if (err < 0) goto out; break; } - err = drbd_recv_header(mdev->tconn, pi); - if (err) + if (!drbd_recv_header(mdev, &cmd, &data_size)) goto out; } @@ -4280,8 +3620,8 @@ static int receive_bitmap(struct drbd_tconn *tconn, struct packet_info *pi) if (mdev->state.conn == C_WF_BITMAP_T) { enum drbd_state_rv rv; - err = drbd_send_bitmap(mdev); - if (err) + ok = !drbd_send_bitmap(mdev); + if (!ok) goto out; /* Omit CS_ORDERED with this state transition to avoid deadlocks. */ rv = _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE); @@ -4292,40 +3632,47 @@ static int receive_bitmap(struct drbd_tconn *tconn, struct packet_info *pi) dev_info(DEV, "unexpected cstate (%s) in receive_bitmap\n", drbd_conn_str(mdev->state.conn)); } - err = 0; + ok = true; out: drbd_bm_unlock(mdev); - if (!err && mdev->state.conn == C_WF_BITMAP_S) + if (ok && mdev->state.conn == C_WF_BITMAP_S) drbd_start_resync(mdev, C_SYNC_SOURCE); - return err; + free_page((unsigned long) buffer); + return ok; } -static int receive_skip(struct drbd_tconn *tconn, struct packet_info *pi) +static int receive_skip(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) { - conn_warn(tconn, "skipping unknown optional packet type %d, l: %d!\n", - pi->cmd, pi->size); + /* TODO zero copy sink :) */ + static char sink[128]; + int size, want, r; - return ignore_remaining_packet(tconn, pi); + dev_warn(DEV, "skipping unknown optional packet type %d, l: %d!\n", + cmd, data_size); + + size = data_size; + while (size > 0) { + want = min_t(int, size, sizeof(sink)); + r = drbd_recv(mdev, sink, want); + ERR_IF(r <= 0) break; + size -= r; + } + return size == 0; } -static int receive_UnplugRemote(struct drbd_tconn *tconn, struct packet_info *pi) +static int receive_UnplugRemote(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) { /* Make sure we've acked all the TCP data associated * with the data requests being unplugged */ - drbd_tcp_quickack(tconn->data.socket); + drbd_tcp_quickack(mdev->data.socket); - return 0; + return true; } -static int receive_out_of_sync(struct drbd_tconn *tconn, struct packet_info *pi) +static int receive_out_of_sync(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) { - struct drbd_conf *mdev; - struct p_block_desc *p = 
pi->data; - - mdev = vnr_to_mdev(tconn, pi->vnr); - if (!mdev) - return -EIO; + struct p_block_desc *p = &mdev->data.rbuf.block_desc; switch (mdev->state.conn) { case C_WF_SYNC_UUID: @@ -4339,13 +3686,15 @@ static int receive_out_of_sync(struct drbd_tconn *tconn, struct packet_info *pi) drbd_set_out_of_sync(mdev, be64_to_cpu(p->sector), be32_to_cpu(p->blksize)); - return 0; + return true; } +typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, enum drbd_packets cmd, unsigned int to_receive); + struct data_cmd { int expect_payload; size_t pkt_size; - int (*fn)(struct drbd_tconn *, struct packet_info *); + drbd_cmd_handler_f function; }; static struct data_cmd drbd_cmd_handler[] = { @@ -4353,13 +3702,13 @@ static struct data_cmd drbd_cmd_handler[] = { [P_DATA_REPLY] = { 1, sizeof(struct p_data), receive_DataReply }, [P_RS_DATA_REPLY] = { 1, sizeof(struct p_data), receive_RSDataReply } , [P_BARRIER] = { 0, sizeof(struct p_barrier), receive_Barrier } , - [P_BITMAP] = { 1, 0, receive_bitmap } , - [P_COMPRESSED_BITMAP] = { 1, 0, receive_bitmap } , - [P_UNPLUG_REMOTE] = { 0, 0, receive_UnplugRemote }, + [P_BITMAP] = { 1, sizeof(struct p_header80), receive_bitmap } , + [P_COMPRESSED_BITMAP] = { 1, sizeof(struct p_header80), receive_bitmap } , + [P_UNPLUG_REMOTE] = { 0, sizeof(struct p_header80), receive_UnplugRemote }, [P_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest }, [P_RS_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest }, - [P_SYNC_PARAM] = { 1, 0, receive_SyncParam }, - [P_SYNC_PARAM89] = { 1, 0, receive_SyncParam }, + [P_SYNC_PARAM] = { 1, sizeof(struct p_header80), receive_SyncParam }, + [P_SYNC_PARAM89] = { 1, sizeof(struct p_header80), receive_SyncParam }, [P_PROTOCOL] = { 1, sizeof(struct p_protocol), receive_protocol }, [P_UUIDS] = { 0, sizeof(struct p_uuids), receive_uuids }, [P_SIZES] = { 0, sizeof(struct p_sizes), receive_sizes }, @@ -4371,75 +3720,124 @@ static struct data_cmd drbd_cmd_handler[] = { [P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest }, [P_DELAY_PROBE] = { 0, sizeof(struct p_delay_probe93), receive_skip }, [P_OUT_OF_SYNC] = { 0, sizeof(struct p_block_desc), receive_out_of_sync }, - [P_CONN_ST_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_conn_state }, - [P_PROTOCOL_UPDATE] = { 1, sizeof(struct p_protocol), receive_protocol }, + /* anything missing from this table is in + * the asender_tbl, see get_asender_cmd */ + [P_MAX_CMD] = { 0, 0, NULL }, }; -static void drbdd(struct drbd_tconn *tconn) +/* All handler functions that expect a sub-header get that sub-heder in + mdev->data.rbuf.header.head.payload. + + Usually in mdev->data.rbuf.header.head the callback can find the usual + p_header, but they may not rely on that. Since there is also p_header95 ! 
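As a simplified picture of the drbd_cmd_handler[] dispatch used by drbdd() above: each packet type carries a fixed sub-header size, a flag saying whether a trailing payload is allowed, and a handler; unknown types or unexpected payloads abort the receive loop. The packet names below are reused from the driver, but the sizes and handlers are made up for this stand-alone example.

#include <stdio.h>
#include <stddef.h>

enum pkt { P_DATA, P_UNPLUG_REMOTE, P_MAX_CMD };

struct data_cmd {
    int expect_payload;                  /* may this packet carry extra bytes? */
    size_t pkt_size;                     /* fixed sub-header size to read first */
    int (*fn)(enum pkt cmd, unsigned int payload);
};

static int recv_data(enum pkt c, unsigned int len)   { printf("data, %u payload bytes\n", len); return 1; }
static int recv_unplug(enum pkt c, unsigned int len) { printf("unplug\n"); return 1; }

static const struct data_cmd handlers[] = {
    [P_DATA]          = { 1, 24, recv_data },     /* sizes are illustrative */
    [P_UNPLUG_REMOTE] = { 0,  0, recv_unplug },
};

/* Dispatch one received packet header; returns 0 on protocol error. */
static int dispatch(unsigned int cmd, unsigned int packet_size)
{
    if (cmd >= P_MAX_CMD || !handlers[cmd].fn)
        return 0;                        /* unknown packet type */
    if (packet_size < handlers[cmd].pkt_size)
        return 0;                        /* truncated sub-header */
    if (packet_size > handlers[cmd].pkt_size && !handlers[cmd].expect_payload)
        return 0;                        /* payload where none is allowed */
    return handlers[cmd].fn((enum pkt)cmd, packet_size - handlers[cmd].pkt_size);
}

int main(void)
{
    dispatch(P_DATA, 24 + 4096);
    dispatch(P_UNPLUG_REMOTE, 0);
    return 0;
}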
+ */ + +static void drbdd(struct drbd_conf *mdev) { - struct packet_info pi; + union p_header *header = &mdev->data.rbuf.header; + unsigned int packet_size; + enum drbd_packets cmd; size_t shs; /* sub header size */ - int err; - - while (get_t_state(&tconn->receiver) == RUNNING) { - struct data_cmd *cmd; + int rv; - drbd_thread_current_set_cpu(&tconn->receiver); - if (drbd_recv_header(tconn, &pi)) + while (get_t_state(&mdev->receiver) == Running) { + drbd_thread_current_set_cpu(mdev); + if (!drbd_recv_header(mdev, &cmd, &packet_size)) goto err_out; - cmd = &drbd_cmd_handler[pi.cmd]; - if (unlikely(pi.cmd >= ARRAY_SIZE(drbd_cmd_handler) || !cmd->fn)) { - conn_err(tconn, "Unexpected data packet %s (0x%04x)", - cmdname(pi.cmd), pi.cmd); + if (unlikely(cmd >= P_MAX_CMD || !drbd_cmd_handler[cmd].function)) { + dev_err(DEV, "unknown packet type %d, l: %d!\n", cmd, packet_size); goto err_out; } - shs = cmd->pkt_size; - if (pi.size > shs && !cmd->expect_payload) { - conn_err(tconn, "No payload expected %s l:%d\n", - cmdname(pi.cmd), pi.size); + shs = drbd_cmd_handler[cmd].pkt_size - sizeof(union p_header); + if (packet_size - shs > 0 && !drbd_cmd_handler[cmd].expect_payload) { + dev_err(DEV, "No payload expected %s l:%d\n", cmdname(cmd), packet_size); goto err_out; } if (shs) { - err = drbd_recv_all_warn(tconn, pi.data, shs); - if (err) + rv = drbd_recv(mdev, &header->h80.payload, shs); + if (unlikely(rv != shs)) { + if (!signal_pending(current)) + dev_warn(DEV, "short read while reading sub header: rv=%d\n", rv); goto err_out; - pi.size -= shs; + } } - err = cmd->fn(tconn, &pi); - if (err) { - conn_err(tconn, "error receiving %s, e: %d l: %d!\n", - cmdname(pi.cmd), err, pi.size); + rv = drbd_cmd_handler[cmd].function(mdev, cmd, packet_size - shs); + + if (unlikely(!rv)) { + dev_err(DEV, "error receiving %s, l: %d!\n", + cmdname(cmd), packet_size); goto err_out; } } - return; - err_out: - conn_request_state(tconn, NS(conn, C_PROTOCOL_ERROR), CS_HARD); + if (0) { + err_out: + drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); + } + /* If we leave here, we probably want to update at least the + * "Connected" indicator on stable storage. Do so explicitly here. */ + drbd_md_sync(mdev); } -void conn_flush_workqueue(struct drbd_tconn *tconn) +void drbd_flush_workqueue(struct drbd_conf *mdev) { struct drbd_wq_barrier barr; barr.w.cb = w_prev_work_done; - barr.w.tconn = tconn; init_completion(&barr.done); - drbd_queue_work(&tconn->sender_work, &barr.w); + drbd_queue_work(&mdev->data.work, &barr.w); wait_for_completion(&barr.done); } -static void conn_disconnect(struct drbd_tconn *tconn) +void drbd_free_tl_hash(struct drbd_conf *mdev) +{ + struct hlist_head *h; + + spin_lock_irq(&mdev->req_lock); + + if (!mdev->tl_hash || mdev->state.conn != C_STANDALONE) { + spin_unlock_irq(&mdev->req_lock); + return; + } + /* paranoia code */ + for (h = mdev->ee_hash; h < mdev->ee_hash + mdev->ee_hash_s; h++) + if (h->first) + dev_err(DEV, "ASSERT FAILED ee_hash[%u].first == %p, expected NULL\n", + (int)(h - mdev->ee_hash), h->first); + kfree(mdev->ee_hash); + mdev->ee_hash = NULL; + mdev->ee_hash_s = 0; + + /* We may not have had the chance to wait for all locally pending + * application requests. The hlist_add_fake() prevents access after + * free on master bio completion. 
*/ + for (h = mdev->tl_hash; h < mdev->tl_hash + mdev->tl_hash_s; h++) { + struct drbd_request *req; + struct hlist_node *pos, *n; + hlist_for_each_entry_safe(req, pos, n, h, collision) { + hlist_del_init(&req->collision); + hlist_add_fake(&req->collision); + } + } + + kfree(mdev->tl_hash); + mdev->tl_hash = NULL; + mdev->tl_hash_s = 0; + spin_unlock_irq(&mdev->req_lock); +} + +static void drbd_disconnect(struct drbd_conf *mdev) { - struct drbd_conf *mdev; - enum drbd_conns oc; - int vnr; + enum drbd_fencing_p fp; + union drbd_state os, ns; + int rv = SS_UNKNOWN_ERROR; + unsigned int i; - if (tconn->cstate == C_STANDALONE) + if (mdev->state.conn == C_STANDALONE) return; /* We are about to start the cleanup after connection loss. @@ -4447,54 +3845,18 @@ static void conn_disconnect(struct drbd_tconn *tconn) * Usually we should be in some network failure state already, * but just in case we are not, we fix it up here. */ - conn_request_state(tconn, NS(conn, C_NETWORK_FAILURE), CS_HARD); + drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE)); /* asender does not clean up anything. it must not interfere, either */ - drbd_thread_stop(&tconn->asender); - drbd_free_sock(tconn); - - rcu_read_lock(); - idr_for_each_entry(&tconn->volumes, mdev, vnr) { - kref_get(&mdev->kref); - rcu_read_unlock(); - drbd_disconnected(mdev); - kref_put(&mdev->kref, &drbd_minor_destroy); - rcu_read_lock(); - } - rcu_read_unlock(); - - if (!list_empty(&tconn->current_epoch->list)) - conn_err(tconn, "ASSERTION FAILED: tconn->current_epoch->list not empty\n"); - /* ok, no more ee's on the fly, it is safe to reset the epoch_size */ - atomic_set(&tconn->current_epoch->epoch_size, 0); - tconn->send.seen_any_write_yet = false; - - conn_info(tconn, "Connection closed\n"); - - if (conn_highest_role(tconn) == R_PRIMARY && conn_highest_pdsk(tconn) >= D_UNKNOWN) - conn_try_outdate_peer_async(tconn); - - spin_lock_irq(&tconn->req_lock); - oc = tconn->cstate; - if (oc >= C_UNCONNECTED) - _conn_request_state(tconn, NS(conn, C_UNCONNECTED), CS_VERBOSE); - - spin_unlock_irq(&tconn->req_lock); - - if (oc == C_DISCONNECTING) - conn_request_state(tconn, NS(conn, C_STANDALONE), CS_VERBOSE | CS_HARD); -} - -static int drbd_disconnected(struct drbd_conf *mdev) -{ - unsigned int i; + drbd_thread_stop(&mdev->asender); + drbd_free_sock(mdev); /* wait for current activity to cease. */ - spin_lock_irq(&mdev->tconn->req_lock); + spin_lock_irq(&mdev->req_lock); _drbd_wait_ee_list_empty(mdev, &mdev->active_ee); _drbd_wait_ee_list_empty(mdev, &mdev->sync_ee); _drbd_wait_ee_list_empty(mdev, &mdev->read_ee); - spin_unlock_irq(&mdev->tconn->req_lock); + spin_unlock_irq(&mdev->req_lock); /* We do not have data structures that would allow us to * get the rs_pending_cnt down to 0 again. @@ -4512,6 +3874,7 @@ static int drbd_disconnected(struct drbd_conf *mdev) atomic_set(&mdev->rs_pending_cnt, 0); wake_up(&mdev->misc_wait); + /* make sure syncer is stopped and w_resume_next_sg queued */ del_timer_sync(&mdev->resync_timer); resync_timer_fn((unsigned long)mdev); @@ -4520,25 +3883,50 @@ static int drbd_disconnected(struct drbd_conf *mdev) * to be "canceled" */ drbd_flush_workqueue(mdev); - drbd_finish_peer_reqs(mdev); - - /* This second workqueue flush is necessary, since drbd_finish_peer_reqs() - might have issued a work again. The one before drbd_finish_peer_reqs() is - necessary to reclain net_ee in drbd_finish_peer_reqs(). 
*/ - drbd_flush_workqueue(mdev); - - /* need to do it again, drbd_finish_peer_reqs() may have populated it - * again via drbd_try_clear_on_disk_bm(). */ - drbd_rs_cancel_all(mdev); + /* This also does reclaim_net_ee(). If we do this too early, we might + * miss some resync ee and pages.*/ + drbd_process_done_ee(mdev); kfree(mdev->p_uuid); mdev->p_uuid = NULL; - if (!drbd_suspended(mdev)) - tl_clear(mdev->tconn); + if (!is_susp(mdev->state)) + tl_clear(mdev); + + dev_info(DEV, "Connection closed\n"); drbd_md_sync(mdev); + fp = FP_DONT_CARE; + if (get_ldev(mdev)) { + fp = mdev->ldev->dc.fencing; + put_ldev(mdev); + } + + if (mdev->state.role == R_PRIMARY && fp >= FP_RESOURCE && mdev->state.pdsk >= D_UNKNOWN) + drbd_try_outdate_peer_async(mdev); + + spin_lock_irq(&mdev->req_lock); + os = mdev->state; + if (os.conn >= C_UNCONNECTED) { + /* Do not restart in case we are C_DISCONNECTING */ + ns = os; + ns.conn = C_UNCONNECTED; + rv = _drbd_set_state(mdev, ns, CS_VERBOSE, NULL); + } + spin_unlock_irq(&mdev->req_lock); + + if (os.conn == C_DISCONNECTING) { + wait_event(mdev->net_cnt_wait, atomic_read(&mdev->net_cnt) == 0); + + crypto_free_hash(mdev->cram_hmac_tfm); + mdev->cram_hmac_tfm = NULL; + + kfree(mdev->net_conf); + mdev->net_conf = NULL; + drbd_request_state(mdev, NS(conn, C_STANDALONE)); + } + /* serialize with bitmap writeout triggered by the state change, * if any. */ wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); @@ -4550,7 +3938,7 @@ static int drbd_disconnected(struct drbd_conf *mdev) * Actually we don't care for exactly when the network stack does its * put_page(), but release our reference on these pages right here. */ - i = drbd_free_peer_reqs(mdev, &mdev->net_ee); + i = drbd_release_ee(mdev, &mdev->net_ee); if (i) dev_info(DEV, "net_ee not empty, killed %u entries\n", i); i = atomic_read(&mdev->pp_in_use_by_net); @@ -4565,7 +3953,9 @@ static int drbd_disconnected(struct drbd_conf *mdev) D_ASSERT(list_empty(&mdev->sync_ee)); D_ASSERT(list_empty(&mdev->done_ee)); - return 0; + /* ok, no more ee's on the fly, it is safe to reset the epoch_size */ + atomic_set(&mdev->current_epoch->epoch_size, 0); + D_ASSERT(list_empty(&mdev->current_epoch->list)); } /* @@ -4577,19 +3967,29 @@ static int drbd_disconnected(struct drbd_conf *mdev) * * for now, they are expected to be zero, but ignored. */ -static int drbd_send_features(struct drbd_tconn *tconn) +static int drbd_send_handshake(struct drbd_conf *mdev) { - struct drbd_socket *sock; - struct p_connection_features *p; + /* ASSERT current == mdev->receiver ... */ + struct p_handshake *p = &mdev->data.sbuf.handshake; + int ok; + + if (mutex_lock_interruptible(&mdev->data.mutex)) { + dev_err(DEV, "interrupted during initial handshake\n"); + return 0; /* interrupted. not ok. */ + } + + if (mdev->data.socket == NULL) { + mutex_unlock(&mdev->data.mutex); + return 0; + } - sock = &tconn->data; - p = conn_prepare_command(tconn, sock); - if (!p) - return -EIO; memset(p, 0, sizeof(*p)); p->protocol_min = cpu_to_be32(PRO_VERSION_MIN); p->protocol_max = cpu_to_be32(PRO_VERSION_MAX); - return conn_send_command(tconn, sock, P_CONNECTION_FEATURES, sizeof(*p), NULL, 0); + ok = _drbd_send_cmd( mdev, mdev->data.socket, P_HAND_SHAKE, + (struct p_header80 *)p, sizeof(*p), 0 ); + mutex_unlock(&mdev->data.mutex); + return ok; } /* @@ -4599,38 +3999,42 @@ static int drbd_send_features(struct drbd_tconn *tconn) * -1 peer talks different language, * no point in trying again, please go standalone. 
*/ -static int drbd_do_features(struct drbd_tconn *tconn) +static int drbd_do_handshake(struct drbd_conf *mdev) { - /* ASSERT current == tconn->receiver ... */ - struct p_connection_features *p; - const int expect = sizeof(struct p_connection_features); - struct packet_info pi; - int err; + /* ASSERT current == mdev->receiver ... */ + struct p_handshake *p = &mdev->data.rbuf.handshake; + const int expect = sizeof(struct p_handshake) - sizeof(struct p_header80); + unsigned int length; + enum drbd_packets cmd; + int rv; - err = drbd_send_features(tconn); - if (err) + rv = drbd_send_handshake(mdev); + if (!rv) return 0; - err = drbd_recv_header(tconn, &pi); - if (err) + rv = drbd_recv_header(mdev, &cmd, &length); + if (!rv) return 0; - if (pi.cmd != P_CONNECTION_FEATURES) { - conn_err(tconn, "expected ConnectionFeatures packet, received: %s (0x%04x)\n", - cmdname(pi.cmd), pi.cmd); + if (cmd != P_HAND_SHAKE) { + dev_err(DEV, "expected HandShake packet, received: %s (0x%04x)\n", + cmdname(cmd), cmd); return -1; } - if (pi.size != expect) { - conn_err(tconn, "expected ConnectionFeatures length: %u, received: %u\n", - expect, pi.size); + if (length != expect) { + dev_err(DEV, "expected HandShake length: %u, received: %u\n", + expect, length); return -1; } - p = pi.data; - err = drbd_recv_all_warn(tconn, p, expect); - if (err) + rv = drbd_recv(mdev, &p->head.payload, expect); + + if (rv != expect) { + if (!signal_pending(current)) + dev_warn(DEV, "short read receiving handshake packet: l=%u\n", rv); return 0; + } p->protocol_min = be32_to_cpu(p->protocol_min); p->protocol_max = be32_to_cpu(p->protocol_max); @@ -4641,15 +4045,15 @@ static int drbd_do_features(struct drbd_tconn *tconn) PRO_VERSION_MIN > p->protocol_max) goto incompat; - tconn->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max); + mdev->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max); - conn_info(tconn, "Handshake successful: " - "Agreed network protocol version %d\n", tconn->agreed_pro_version); + dev_info(DEV, "Handshake successful: " + "Agreed network protocol version %d\n", mdev->agreed_pro_version); return 1; incompat: - conn_err(tconn, "incompatible DRBD dialects: " + dev_err(DEV, "incompatible DRBD dialects: " "I support %d-%d, peer supports %d-%d\n", PRO_VERSION_MIN, PRO_VERSION_MAX, p->protocol_min, p->protocol_max); @@ -4657,7 +4061,7 @@ static int drbd_do_features(struct drbd_tconn *tconn) } #if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE) -static int drbd_do_auth(struct drbd_tconn *tconn) +static int drbd_do_auth(struct drbd_conf *mdev) { dev_err(DEV, "This kernel was build without CONFIG_CRYPTO_HMAC.\n"); dev_err(DEV, "You need to disable 'cram-hmac-alg' in drbd.conf.\n"); @@ -4672,139 +4076,121 @@ static int drbd_do_auth(struct drbd_tconn *tconn) -1 - auth failed, don't try again. */ -static int drbd_do_auth(struct drbd_tconn *tconn) +static int drbd_do_auth(struct drbd_conf *mdev) { - struct drbd_socket *sock; char my_challenge[CHALLENGE_LEN]; /* 64 Bytes... */ struct scatterlist sg; char *response = NULL; char *right_response = NULL; char *peers_ch = NULL; - unsigned int key_len; - char secret[SHARED_SECRET_MAX]; /* 64 byte */ + unsigned int key_len = strlen(mdev->net_conf->shared_secret); unsigned int resp_size; struct hash_desc desc; - struct packet_info pi; - struct net_conf *nc; - int err, rv; - - /* FIXME: Put the challenge/response into the preallocated socket buffer. 
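On the feature handshake in this hunk: each side advertises the protocol range it supports, the connection is refused when the ranges do not overlap, and otherwise the highest version both sides can speak is agreed on. The stand-alone sketch below shows that negotiation; the local range constants are placeholders, not the driver's real PRO_VERSION_MIN/MAX values.

#include <stdio.h>

#define PRO_VERSION_MIN 86   /* illustrative local range */
#define PRO_VERSION_MAX 96

/* Pick the protocol version both sides can speak.
 * Returns the agreed version, or -1 if the ranges do not overlap. */
static int negotiate_version(int peer_min, int peer_max)
{
    if (PRO_VERSION_MAX < peer_min || PRO_VERSION_MIN > peer_max)
        return -1;                               /* no common dialect */
    return PRO_VERSION_MAX < peer_max ? PRO_VERSION_MAX : peer_max;
}

int main(void)
{
    printf("%d\n", negotiate_version(86, 100)); /* -> 96, newest common version */
    printf("%d\n", negotiate_version(97, 100)); /* -> -1, peer is too new for us */
    return 0;
}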
*/ - - rcu_read_lock(); - nc = rcu_dereference(tconn->net_conf); - key_len = strlen(nc->shared_secret); - memcpy(secret, nc->shared_secret, key_len); - rcu_read_unlock(); + enum drbd_packets cmd; + unsigned int length; + int rv; - desc.tfm = tconn->cram_hmac_tfm; + desc.tfm = mdev->cram_hmac_tfm; desc.flags = 0; - rv = crypto_hash_setkey(tconn->cram_hmac_tfm, (u8 *)secret, key_len); + rv = crypto_hash_setkey(mdev->cram_hmac_tfm, + (u8 *)mdev->net_conf->shared_secret, key_len); if (rv) { - conn_err(tconn, "crypto_hash_setkey() failed with %d\n", rv); + dev_err(DEV, "crypto_hash_setkey() failed with %d\n", rv); rv = -1; goto fail; } get_random_bytes(my_challenge, CHALLENGE_LEN); - sock = &tconn->data; - if (!conn_prepare_command(tconn, sock)) { - rv = 0; - goto fail; - } - rv = !conn_send_command(tconn, sock, P_AUTH_CHALLENGE, 0, - my_challenge, CHALLENGE_LEN); + rv = drbd_send_cmd2(mdev, P_AUTH_CHALLENGE, my_challenge, CHALLENGE_LEN); if (!rv) goto fail; - err = drbd_recv_header(tconn, &pi); - if (err) { - rv = 0; + rv = drbd_recv_header(mdev, &cmd, &length); + if (!rv) goto fail; - } - if (pi.cmd != P_AUTH_CHALLENGE) { - conn_err(tconn, "expected AuthChallenge packet, received: %s (0x%04x)\n", - cmdname(pi.cmd), pi.cmd); + if (cmd != P_AUTH_CHALLENGE) { + dev_err(DEV, "expected AuthChallenge packet, received: %s (0x%04x)\n", + cmdname(cmd), cmd); rv = 0; goto fail; } - if (pi.size > CHALLENGE_LEN * 2) { - conn_err(tconn, "expected AuthChallenge payload too big.\n"); + if (length > CHALLENGE_LEN * 2) { + dev_err(DEV, "expected AuthChallenge payload too big.\n"); rv = -1; goto fail; } - peers_ch = kmalloc(pi.size, GFP_NOIO); + peers_ch = kmalloc(length, GFP_NOIO); if (peers_ch == NULL) { - conn_err(tconn, "kmalloc of peers_ch failed\n"); + dev_err(DEV, "kmalloc of peers_ch failed\n"); rv = -1; goto fail; } - err = drbd_recv_all_warn(tconn, peers_ch, pi.size); - if (err) { + rv = drbd_recv(mdev, peers_ch, length); + + if (rv != length) { + if (!signal_pending(current)) + dev_warn(DEV, "short read AuthChallenge: l=%u\n", rv); rv = 0; goto fail; } - resp_size = crypto_hash_digestsize(tconn->cram_hmac_tfm); + resp_size = crypto_hash_digestsize(mdev->cram_hmac_tfm); response = kmalloc(resp_size, GFP_NOIO); if (response == NULL) { - conn_err(tconn, "kmalloc of response failed\n"); + dev_err(DEV, "kmalloc of response failed\n"); rv = -1; goto fail; } sg_init_table(&sg, 1); - sg_set_buf(&sg, peers_ch, pi.size); + sg_set_buf(&sg, peers_ch, length); rv = crypto_hash_digest(&desc, &sg, sg.length, response); if (rv) { - conn_err(tconn, "crypto_hash_digest() failed with %d\n", rv); + dev_err(DEV, "crypto_hash_digest() failed with %d\n", rv); rv = -1; goto fail; } - if (!conn_prepare_command(tconn, sock)) { - rv = 0; - goto fail; - } - rv = !conn_send_command(tconn, sock, P_AUTH_RESPONSE, 0, - response, resp_size); + rv = drbd_send_cmd2(mdev, P_AUTH_RESPONSE, response, resp_size); if (!rv) goto fail; - err = drbd_recv_header(tconn, &pi); - if (err) { - rv = 0; + rv = drbd_recv_header(mdev, &cmd, &length); + if (!rv) goto fail; - } - if (pi.cmd != P_AUTH_RESPONSE) { - conn_err(tconn, "expected AuthResponse packet, received: %s (0x%04x)\n", - cmdname(pi.cmd), pi.cmd); + if (cmd != P_AUTH_RESPONSE) { + dev_err(DEV, "expected AuthResponse packet, received: %s (0x%04x)\n", + cmdname(cmd), cmd); rv = 0; goto fail; } - if (pi.size != resp_size) { - conn_err(tconn, "expected AuthResponse payload of wrong size\n"); + if (length != resp_size) { + dev_err(DEV, "expected AuthResponse payload of wrong size\n"); rv = 
0; goto fail; } - err = drbd_recv_all_warn(tconn, response , resp_size); - if (err) { + rv = drbd_recv(mdev, response , resp_size); + + if (rv != resp_size) { + if (!signal_pending(current)) + dev_warn(DEV, "short read receiving AuthResponse: l=%u\n", rv); rv = 0; goto fail; } right_response = kmalloc(resp_size, GFP_NOIO); if (right_response == NULL) { - conn_err(tconn, "kmalloc of right_response failed\n"); + dev_err(DEV, "kmalloc of right_response failed\n"); rv = -1; goto fail; } @@ -4813,7 +4199,7 @@ static int drbd_do_auth(struct drbd_tconn *tconn) rv = crypto_hash_digest(&desc, &sg, sg.length, right_response); if (rv) { - conn_err(tconn, "crypto_hash_digest() failed with %d\n", rv); + dev_err(DEV, "crypto_hash_digest() failed with %d\n", rv); rv = -1; goto fail; } @@ -4821,8 +4207,8 @@ static int drbd_do_auth(struct drbd_tconn *tconn) rv = !memcmp(response, right_response, resp_size); if (rv) - conn_info(tconn, "Peer authenticated using %d bytes HMAC\n", - resp_size); + dev_info(DEV, "Peer authenticated using %d bytes of '%s' HMAC\n", + resp_size, mdev->net_conf->cram_hmac_alg); else rv = -1; @@ -4837,106 +4223,82 @@ static int drbd_do_auth(struct drbd_tconn *tconn) int drbdd_init(struct drbd_thread *thi) { - struct drbd_tconn *tconn = thi->tconn; + struct drbd_conf *mdev = thi->mdev; + unsigned int minor = mdev_to_minor(mdev); int h; - conn_info(tconn, "receiver (re)started\n"); + sprintf(current->comm, "drbd%d_receiver", minor); + + dev_info(DEV, "receiver (re)started\n"); do { - h = conn_connect(tconn); + h = drbd_connect(mdev); if (h == 0) { - conn_disconnect(tconn); + drbd_disconnect(mdev); schedule_timeout_interruptible(HZ); } if (h == -1) { - conn_warn(tconn, "Discarding network configuration.\n"); - conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD); + dev_warn(DEV, "Discarding network configuration.\n"); + drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); } } while (h == 0); - if (h > 0) - drbdd(tconn); + if (h > 0) { + if (get_net_conf(mdev)) { + drbdd(mdev); + put_net_conf(mdev); + } + } - conn_disconnect(tconn); + drbd_disconnect(mdev); - conn_info(tconn, "receiver terminated\n"); + dev_info(DEV, "receiver terminated\n"); return 0; } /* ********* acknowledge sender ******** */ -static int got_conn_RqSReply(struct drbd_tconn *tconn, struct packet_info *pi) +static int got_RqSReply(struct drbd_conf *mdev, struct p_header80 *h) { - struct p_req_state_reply *p = pi->data; - int retcode = be32_to_cpu(p->retcode); - - if (retcode >= SS_SUCCESS) { - set_bit(CONN_WD_ST_CHG_OKAY, &tconn->flags); - } else { - set_bit(CONN_WD_ST_CHG_FAIL, &tconn->flags); - conn_err(tconn, "Requested state change failed by peer: %s (%d)\n", - drbd_set_st_err_str(retcode), retcode); - } - wake_up(&tconn->ping_wait); - - return 0; -} + struct p_req_state_reply *p = (struct p_req_state_reply *)h; -static int got_RqSReply(struct drbd_tconn *tconn, struct packet_info *pi) -{ - struct drbd_conf *mdev; - struct p_req_state_reply *p = pi->data; int retcode = be32_to_cpu(p->retcode); - mdev = vnr_to_mdev(tconn, pi->vnr); - if (!mdev) - return -EIO; - - if (test_bit(CONN_WD_ST_CHG_REQ, &tconn->flags)) { - D_ASSERT(tconn->agreed_pro_version < 100); - return got_conn_RqSReply(tconn, pi); - } - if (retcode >= SS_SUCCESS) { set_bit(CL_ST_CHG_SUCCESS, &mdev->flags); } else { set_bit(CL_ST_CHG_FAIL, &mdev->flags); dev_err(DEV, "Requested state change failed by peer: %s (%d)\n", - drbd_set_st_err_str(retcode), retcode); + drbd_set_st_err_str(retcode), retcode); } wake_up(&mdev->state_wait); - return 0; + 
return true; } -static int got_Ping(struct drbd_tconn *tconn, struct packet_info *pi) +static int got_Ping(struct drbd_conf *mdev, struct p_header80 *h) { - return drbd_send_ping_ack(tconn); + return drbd_send_ping_ack(mdev); } -static int got_PingAck(struct drbd_tconn *tconn, struct packet_info *pi) +static int got_PingAck(struct drbd_conf *mdev, struct p_header80 *h) { /* restore idle timeout */ - tconn->meta.socket->sk->sk_rcvtimeo = tconn->net_conf->ping_int*HZ; - if (!test_and_set_bit(GOT_PING_ACK, &tconn->flags)) - wake_up(&tconn->ping_wait); + mdev->meta.socket->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ; + if (!test_and_set_bit(GOT_PING_ACK, &mdev->flags)) + wake_up(&mdev->misc_wait); - return 0; + return true; } -static int got_IsInSync(struct drbd_tconn *tconn, struct packet_info *pi) +static int got_IsInSync(struct drbd_conf *mdev, struct p_header80 *h) { - struct drbd_conf *mdev; - struct p_block_ack *p = pi->data; + struct p_block_ack *p = (struct p_block_ack *)h; sector_t sector = be64_to_cpu(p->sector); int blksize = be32_to_cpu(p->blksize); - mdev = vnr_to_mdev(tconn, pi->vnr); - if (!mdev) - return -EIO; - - D_ASSERT(mdev->tconn->agreed_pro_version >= 89); + D_ASSERT(mdev->agreed_pro_version >= 89); update_peer_seq(mdev, be32_to_cpu(p->seq_num)); @@ -4950,139 +4312,162 @@ static int got_IsInSync(struct drbd_tconn *tconn, struct packet_info *pi) dec_rs_pending(mdev); atomic_add(blksize >> 9, &mdev->rs_sect_in); - return 0; + return true; } -static int -validate_req_change_req_state(struct drbd_conf *mdev, u64 id, sector_t sector, - struct rb_root *root, const char *func, - enum drbd_req_event what, bool missing_ok) +/* when we receive the ACK for a write request, + * verify that we actually know about it */ +static struct drbd_request *_ack_id_to_req(struct drbd_conf *mdev, + u64 id, sector_t sector) +{ + struct hlist_head *slot = tl_hash_slot(mdev, sector); + struct hlist_node *n; + struct drbd_request *req; + + hlist_for_each_entry(req, n, slot, collision) { + if ((unsigned long)req == (unsigned long)id) { + if (req->sector != sector) { + dev_err(DEV, "_ack_id_to_req: found req %p but it has " + "wrong sector (%llus versus %llus)\n", req, + (unsigned long long)req->sector, + (unsigned long long)sector); + break; + } + return req; + } + } + return NULL; +} + +typedef struct drbd_request *(req_validator_fn) + (struct drbd_conf *mdev, u64 id, sector_t sector); + +static int validate_req_change_req_state(struct drbd_conf *mdev, + u64 id, sector_t sector, req_validator_fn validator, + const char *func, enum drbd_req_event what) { struct drbd_request *req; struct bio_and_error m; - spin_lock_irq(&mdev->tconn->req_lock); - req = find_request(mdev, root, id, sector, missing_ok, func); + spin_lock_irq(&mdev->req_lock); + req = validator(mdev, id, sector); if (unlikely(!req)) { - spin_unlock_irq(&mdev->tconn->req_lock); - return -EIO; + spin_unlock_irq(&mdev->req_lock); + + dev_err(DEV, "%s: failed to find req %p, sector %llus\n", func, + (void *)(unsigned long)id, (unsigned long long)sector); + return false; } __req_mod(req, what, &m); - spin_unlock_irq(&mdev->tconn->req_lock); + spin_unlock_irq(&mdev->req_lock); if (m.bio) complete_master_bio(mdev, &m); - return 0; + return true; } -static int got_BlockAck(struct drbd_tconn *tconn, struct packet_info *pi) +static int got_BlockAck(struct drbd_conf *mdev, struct p_header80 *h) { - struct drbd_conf *mdev; - struct p_block_ack *p = pi->data; + struct p_block_ack *p = (struct p_block_ack *)h; sector_t sector = 
be64_to_cpu(p->sector); int blksize = be32_to_cpu(p->blksize); enum drbd_req_event what; - mdev = vnr_to_mdev(tconn, pi->vnr); - if (!mdev) - return -EIO; - update_peer_seq(mdev, be32_to_cpu(p->seq_num)); - if (p->block_id == ID_SYNCER) { + if (is_syncer_block_id(p->block_id)) { drbd_set_in_sync(mdev, sector, blksize); dec_rs_pending(mdev); - return 0; + return true; } - switch (pi->cmd) { + switch (be16_to_cpu(h->command)) { case P_RS_WRITE_ACK: - what = WRITE_ACKED_BY_PEER_AND_SIS; + D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C); + what = write_acked_by_peer_and_sis; break; case P_WRITE_ACK: - what = WRITE_ACKED_BY_PEER; + D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C); + what = write_acked_by_peer; break; case P_RECV_ACK: - what = RECV_ACKED_BY_PEER; + D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_B); + what = recv_acked_by_peer; break; - case P_SUPERSEDED: - what = CONFLICT_RESOLVED; - break; - case P_RETRY_WRITE: - what = POSTPONE_WRITE; + case P_DISCARD_ACK: + D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C); + what = conflict_discarded_by_peer; break; default: - BUG(); + D_ASSERT(0); + return false; } return validate_req_change_req_state(mdev, p->block_id, sector, - &mdev->write_requests, __func__, - what, false); + _ack_id_to_req, __func__ , what); } -static int got_NegAck(struct drbd_tconn *tconn, struct packet_info *pi) +static int got_NegAck(struct drbd_conf *mdev, struct p_header80 *h) { - struct drbd_conf *mdev; - struct p_block_ack *p = pi->data; + struct p_block_ack *p = (struct p_block_ack *)h; sector_t sector = be64_to_cpu(p->sector); int size = be32_to_cpu(p->blksize); - int err; - - mdev = vnr_to_mdev(tconn, pi->vnr); - if (!mdev) - return -EIO; + struct drbd_request *req; + struct bio_and_error m; update_peer_seq(mdev, be32_to_cpu(p->seq_num)); - if (p->block_id == ID_SYNCER) { + if (is_syncer_block_id(p->block_id)) { dec_rs_pending(mdev); drbd_rs_failed_io(mdev, sector, size); - return 0; + return true; } - err = validate_req_change_req_state(mdev, p->block_id, sector, - &mdev->write_requests, __func__, - NEG_ACKED, true); - if (err) { - /* Protocol A has no P_WRITE_ACKs, but has P_NEG_ACKs. - The master bio might already be completed, therefore the - request is no longer in the collision hash. */ - /* In Protocol B we might already have got a P_RECV_ACK - but then get a P_NEG_ACK afterwards. */ - drbd_set_out_of_sync(mdev, sector, size); + spin_lock_irq(&mdev->req_lock); + req = _ack_id_to_req(mdev, p->block_id, sector); + if (!req) { + spin_unlock_irq(&mdev->req_lock); + if (mdev->net_conf->wire_protocol == DRBD_PROT_A || + mdev->net_conf->wire_protocol == DRBD_PROT_B) { + /* Protocol A has no P_WRITE_ACKs, but has P_NEG_ACKs. + The master bio might already be completed, therefore the + request is no longer in the collision hash. + => Do not try to validate block_id as request. */ + /* In Protocol B we might already have got a P_RECV_ACK + but then get a P_NEG_ACK after wards. 
*/ + drbd_set_out_of_sync(mdev, sector, size); + return true; + } else { + dev_err(DEV, "%s: failed to find req %p, sector %llus\n", __func__, + (void *)(unsigned long)p->block_id, (unsigned long long)sector); + return false; + } } - return 0; + __req_mod(req, neg_acked, &m); + spin_unlock_irq(&mdev->req_lock); + + if (m.bio) + complete_master_bio(mdev, &m); + return true; } -static int got_NegDReply(struct drbd_tconn *tconn, struct packet_info *pi) +static int got_NegDReply(struct drbd_conf *mdev, struct p_header80 *h) { - struct drbd_conf *mdev; - struct p_block_ack *p = pi->data; + struct p_block_ack *p = (struct p_block_ack *)h; sector_t sector = be64_to_cpu(p->sector); - mdev = vnr_to_mdev(tconn, pi->vnr); - if (!mdev) - return -EIO; - update_peer_seq(mdev, be32_to_cpu(p->seq_num)); - - dev_err(DEV, "Got NegDReply; Sector %llus, len %u.\n", + dev_err(DEV, "Got NegDReply; Sector %llus, len %u; Fail original request.\n", (unsigned long long)sector, be32_to_cpu(p->blksize)); return validate_req_change_req_state(mdev, p->block_id, sector, - &mdev->read_requests, __func__, - NEG_ACKED, false); + _ar_id_to_req, __func__ , neg_acked); } -static int got_NegRSDReply(struct drbd_tconn *tconn, struct packet_info *pi) +static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header80 *h) { - struct drbd_conf *mdev; sector_t sector; int size; - struct p_block_ack *p = pi->data; - - mdev = vnr_to_mdev(tconn, pi->vnr); - if (!mdev) - return -EIO; + struct p_block_ack *p = (struct p_block_ack *)h; sector = be64_to_cpu(p->sector); size = be32_to_cpu(p->blksize); @@ -5093,66 +4478,57 @@ static int got_NegRSDReply(struct drbd_tconn *tconn, struct packet_info *pi) if (get_ldev_if_state(mdev, D_FAILED)) { drbd_rs_complete_io(mdev, sector); - switch (pi->cmd) { + switch (be16_to_cpu(h->command)) { case P_NEG_RS_DREPLY: drbd_rs_failed_io(mdev, sector, size); case P_RS_CANCEL: break; default: - BUG(); + D_ASSERT(0); + put_ldev(mdev); + return false; } put_ldev(mdev); } - return 0; + return true; } -static int got_BarrierAck(struct drbd_tconn *tconn, struct packet_info *pi) +static int got_BarrierAck(struct drbd_conf *mdev, struct p_header80 *h) { - struct p_barrier_ack *p = pi->data; - struct drbd_conf *mdev; - int vnr; - - tl_release(tconn, p->barrier, be32_to_cpu(p->set_size)); - - rcu_read_lock(); - idr_for_each_entry(&tconn->volumes, mdev, vnr) { - if (mdev->state.conn == C_AHEAD && - atomic_read(&mdev->ap_in_flight) == 0 && - !test_and_set_bit(AHEAD_TO_SYNC_SOURCE, &mdev->flags)) { - mdev->start_resync_timer.expires = jiffies + HZ; - add_timer(&mdev->start_resync_timer); - } + struct p_barrier_ack *p = (struct p_barrier_ack *)h; + + tl_release(mdev, p->barrier, be32_to_cpu(p->set_size)); + + if (mdev->state.conn == C_AHEAD && + atomic_read(&mdev->ap_in_flight) == 0 && + !test_and_set_bit(AHEAD_TO_SYNC_SOURCE, &mdev->flags)) { + mdev->start_resync_timer.expires = jiffies + HZ; + add_timer(&mdev->start_resync_timer); } - rcu_read_unlock(); - return 0; + return true; } -static int got_OVResult(struct drbd_tconn *tconn, struct packet_info *pi) +static int got_OVResult(struct drbd_conf *mdev, struct p_header80 *h) { - struct drbd_conf *mdev; - struct p_block_ack *p = pi->data; + struct p_block_ack *p = (struct p_block_ack *)h; struct drbd_work *w; sector_t sector; int size; - mdev = vnr_to_mdev(tconn, pi->vnr); - if (!mdev) - return -EIO; - sector = be64_to_cpu(p->sector); size = be32_to_cpu(p->blksize); update_peer_seq(mdev, be32_to_cpu(p->seq_num)); if (be64_to_cpu(p->block_id) == ID_OUT_OF_SYNC) - 
                drbd_ov_out_of_sync_found(mdev, sector, size);
+                drbd_ov_oos_found(mdev, sector, size);
         else
-                ov_out_of_sync_print(mdev);
+                ov_oos_print(mdev);

         if (!get_ldev(mdev))
-                return 0;
+                return true;

         drbd_rs_complete_io(mdev, sector);
         dec_rs_pending(mdev);
@@ -5167,137 +4543,114 @@ static int got_OVResult(struct drbd_tconn *tconn, struct packet_info *pi)
                 w = kmalloc(sizeof(*w), GFP_NOIO);
                 if (w) {
                         w->cb = w_ov_finished;
-                        w->mdev = mdev;
-                        drbd_queue_work(&mdev->tconn->sender_work, w);
+                        drbd_queue_work_front(&mdev->data.work, w);
                 } else {
                         dev_err(DEV, "kmalloc(w) failed.");
-                        ov_out_of_sync_print(mdev);
+                        ov_oos_print(mdev);
                         drbd_resync_finished(mdev);
                 }
         }
         put_ldev(mdev);
-        return 0;
+        return true;
 }

-static int got_skip(struct drbd_tconn *tconn, struct packet_info *pi)
+static int got_skip(struct drbd_conf *mdev, struct p_header80 *h)
 {
-        return 0;
-}
-
-static int tconn_finish_peer_reqs(struct drbd_tconn *tconn)
-{
-        struct drbd_conf *mdev;
-        int vnr, not_empty = 0;
-
-        do {
-                clear_bit(SIGNAL_ASENDER, &tconn->flags);
-                flush_signals(current);
-
-                rcu_read_lock();
-                idr_for_each_entry(&tconn->volumes, mdev, vnr) {
-                        kref_get(&mdev->kref);
-                        rcu_read_unlock();
-                        if (drbd_finish_peer_reqs(mdev)) {
-                                kref_put(&mdev->kref, &drbd_minor_destroy);
-                                return 1;
-                        }
-                        kref_put(&mdev->kref, &drbd_minor_destroy);
-                        rcu_read_lock();
-                }
-                set_bit(SIGNAL_ASENDER, &tconn->flags);
-
-                spin_lock_irq(&tconn->req_lock);
-                idr_for_each_entry(&tconn->volumes, mdev, vnr) {
-                        not_empty = !list_empty(&mdev->done_ee);
-                        if (not_empty)
-                                break;
-                }
-                spin_unlock_irq(&tconn->req_lock);
-                rcu_read_unlock();
-        } while (not_empty);
-
-        return 0;
+        return true;
 }

 struct asender_cmd {
         size_t pkt_size;
-        int (*fn)(struct drbd_tconn *tconn, struct packet_info *);
+        int (*process)(struct drbd_conf *mdev, struct p_header80 *h);
 };

-static struct asender_cmd asender_tbl[] = {
-        [P_PING] = { 0, got_Ping },
-        [P_PING_ACK] = { 0, got_PingAck },
+static struct asender_cmd *get_asender_cmd(int cmd)
+{
+        static struct asender_cmd asender_tbl[] = {
+                /* anything missing from this table is in
+                 * the drbd_cmd_handler (drbd_default_handler) table,
+                 * see the beginning of drbdd() */
+        [P_PING] = { sizeof(struct p_header80), got_Ping },
+        [P_PING_ACK] = { sizeof(struct p_header80), got_PingAck },
         [P_RECV_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
         [P_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
         [P_RS_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
-        [P_SUPERSEDED] = { sizeof(struct p_block_ack), got_BlockAck },
+        [P_DISCARD_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
         [P_NEG_ACK] = { sizeof(struct p_block_ack), got_NegAck },
         [P_NEG_DREPLY] = { sizeof(struct p_block_ack), got_NegDReply },
-        [P_NEG_RS_DREPLY] = { sizeof(struct p_block_ack), got_NegRSDReply },
+        [P_NEG_RS_DREPLY] = { sizeof(struct p_block_ack), got_NegRSDReply},
         [P_OV_RESULT] = { sizeof(struct p_block_ack), got_OVResult },
         [P_BARRIER_ACK] = { sizeof(struct p_barrier_ack), got_BarrierAck },
        [P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply },
         [P_RS_IS_IN_SYNC] = { sizeof(struct p_block_ack), got_IsInSync },
         [P_DELAY_PROBE] = { sizeof(struct p_delay_probe93), got_skip },
-        [P_RS_CANCEL] = { sizeof(struct p_block_ack), got_NegRSDReply },
-        [P_CONN_ST_CHG_REPLY]={ sizeof(struct p_req_state_reply), got_conn_RqSReply },
-        [P_RETRY_WRITE] = { sizeof(struct p_block_ack), got_BlockAck },
-};
+        [P_RS_CANCEL] = { sizeof(struct p_block_ack), got_NegRSDReply},
+        [P_MAX_CMD] = { 0, NULL },
+        };
+        if (cmd > P_MAX_CMD ||
asender_tbl[cmd].process == NULL) + return NULL; + return &asender_tbl[cmd]; +} int drbd_asender(struct drbd_thread *thi) { - struct drbd_tconn *tconn = thi->tconn; + struct drbd_conf *mdev = thi->mdev; + struct p_header80 *h = &mdev->meta.rbuf.header.h80; struct asender_cmd *cmd = NULL; - struct packet_info pi; - int rv; - void *buf = tconn->meta.rbuf; + + int rv, len; + void *buf = h; int received = 0; - unsigned int header_size = drbd_header_size(tconn); - int expect = header_size; - bool ping_timeout_active = false; - struct net_conf *nc; - int ping_timeo, tcp_cork, ping_int; + int expect = sizeof(struct p_header80); + int empty; + int ping_timeout_active = 0; + + sprintf(current->comm, "drbd%d_asender", mdev_to_minor(mdev)); current->policy = SCHED_RR; /* Make this a realtime task! */ current->rt_priority = 2; /* more important than all other tasks */ - while (get_t_state(thi) == RUNNING) { - drbd_thread_current_set_cpu(thi); - - rcu_read_lock(); - nc = rcu_dereference(tconn->net_conf); - ping_timeo = nc->ping_timeo; - tcp_cork = nc->tcp_cork; - ping_int = nc->ping_int; - rcu_read_unlock(); - - if (test_and_clear_bit(SEND_PING, &tconn->flags)) { - if (drbd_send_ping(tconn)) { - conn_err(tconn, "drbd_send_ping has failed\n"); - goto reconnect; - } - tconn->meta.socket->sk->sk_rcvtimeo = ping_timeo * HZ / 10; - ping_timeout_active = true; + while (get_t_state(thi) == Running) { + drbd_thread_current_set_cpu(mdev); + if (test_and_clear_bit(SEND_PING, &mdev->flags)) { + ERR_IF(!drbd_send_ping(mdev)) goto reconnect; + mdev->meta.socket->sk->sk_rcvtimeo = + mdev->net_conf->ping_timeo*HZ/10; + ping_timeout_active = 1; } - /* TODO: conditionally cork; it may hurt latency if we cork without - much to send */ - if (tcp_cork) - drbd_tcp_cork(tconn->meta.socket); - if (tconn_finish_peer_reqs(tconn)) { - conn_err(tconn, "tconn_finish_peer_reqs() failed\n"); - goto reconnect; + /* conditionally cork; + * it may hurt latency if we cork without much to send */ + if (!mdev->net_conf->no_cork && + 3 < atomic_read(&mdev->unacked_cnt)) + drbd_tcp_cork(mdev->meta.socket); + while (1) { + clear_bit(SIGNAL_ASENDER, &mdev->flags); + flush_signals(current); + if (!drbd_process_done_ee(mdev)) + goto reconnect; + /* to avoid race with newly queued ACKs */ + set_bit(SIGNAL_ASENDER, &mdev->flags); + spin_lock_irq(&mdev->req_lock); + empty = list_empty(&mdev->done_ee); + spin_unlock_irq(&mdev->req_lock); + /* new ack may have been queued right here, + * but then there is also a signal pending, + * and we start over... */ + if (empty) + break; } /* but unconditionally uncork unless disabled */ - if (tcp_cork) - drbd_tcp_uncork(tconn->meta.socket); + if (!mdev->net_conf->no_cork) + drbd_tcp_uncork(mdev->meta.socket); /* short circuit, recv_msg would return EINTR anyways. 
*/ if (signal_pending(current)) continue; - rv = drbd_recv_short(tconn->meta.socket, buf, expect-received, 0); - clear_bit(SIGNAL_ASENDER, &tconn->flags); + rv = drbd_recv_short(mdev, mdev->meta.socket, + buf, expect-received, 0); + clear_bit(SIGNAL_ASENDER, &mdev->flags); flush_signals(current); @@ -5315,91 +4668,80 @@ int drbd_asender(struct drbd_thread *thi) received += rv; buf += rv; } else if (rv == 0) { - if (test_bit(DISCONNECT_SENT, &tconn->flags)) { - long t; - rcu_read_lock(); - t = rcu_dereference(tconn->net_conf)->ping_timeo * HZ/10; - rcu_read_unlock(); - - t = wait_event_timeout(tconn->ping_wait, - tconn->cstate < C_WF_REPORT_PARAMS, - t); - if (t) - break; - } - conn_err(tconn, "meta connection shut down by peer.\n"); + dev_err(DEV, "meta connection shut down by peer.\n"); goto reconnect; } else if (rv == -EAGAIN) { /* If the data socket received something meanwhile, * that is good enough: peer is still alive. */ - if (time_after(tconn->last_received, - jiffies - tconn->meta.socket->sk->sk_rcvtimeo)) + if (time_after(mdev->last_received, + jiffies - mdev->meta.socket->sk->sk_rcvtimeo)) continue; if (ping_timeout_active) { - conn_err(tconn, "PingAck did not arrive in time.\n"); + dev_err(DEV, "PingAck did not arrive in time.\n"); goto reconnect; } - set_bit(SEND_PING, &tconn->flags); + set_bit(SEND_PING, &mdev->flags); continue; } else if (rv == -EINTR) { continue; } else { - conn_err(tconn, "sock_recvmsg returned %d\n", rv); + dev_err(DEV, "sock_recvmsg returned %d\n", rv); goto reconnect; } if (received == expect && cmd == NULL) { - if (decode_header(tconn, tconn->meta.rbuf, &pi)) + if (unlikely(h->magic != BE_DRBD_MAGIC)) { + dev_err(DEV, "magic?? on meta m: 0x%08x c: %d l: %d\n", + be32_to_cpu(h->magic), + be16_to_cpu(h->command), + be16_to_cpu(h->length)); goto reconnect; - cmd = &asender_tbl[pi.cmd]; - if (pi.cmd >= ARRAY_SIZE(asender_tbl) || !cmd->fn) { - conn_err(tconn, "Unexpected meta packet %s (0x%04x)\n", - cmdname(pi.cmd), pi.cmd); + } + cmd = get_asender_cmd(be16_to_cpu(h->command)); + len = be16_to_cpu(h->length); + if (unlikely(cmd == NULL)) { + dev_err(DEV, "unknown command?? 
on meta m: 0x%08x c: %d l: %d\n", + be32_to_cpu(h->magic), + be16_to_cpu(h->command), + be16_to_cpu(h->length)); goto disconnect; } - expect = header_size + cmd->pkt_size; - if (pi.size != expect - header_size) { - conn_err(tconn, "Wrong packet size on meta (c: %d, l: %d)\n", - pi.cmd, pi.size); + expect = cmd->pkt_size; + ERR_IF(len != expect-sizeof(struct p_header80)) goto reconnect; - } } if (received == expect) { - bool err; - - err = cmd->fn(tconn, &pi); - if (err) { - conn_err(tconn, "%pf failed\n", cmd->fn); + mdev->last_received = jiffies; + D_ASSERT(cmd != NULL); + if (!cmd->process(mdev, h)) goto reconnect; - } - tconn->last_received = jiffies; + /* the idle_timeout (ping-int) + * has been restored in got_PingAck() */ + if (cmd == get_asender_cmd(P_PING_ACK)) + ping_timeout_active = 0; - if (cmd == &asender_tbl[P_PING_ACK]) { - /* restore idle timeout */ - tconn->meta.socket->sk->sk_rcvtimeo = ping_int * HZ; - ping_timeout_active = false; - } - - buf = tconn->meta.rbuf; + buf = h; received = 0; - expect = header_size; + expect = sizeof(struct p_header80); cmd = NULL; } } if (0) { reconnect: - conn_request_state(tconn, NS(conn, C_NETWORK_FAILURE), CS_HARD); - conn_md_sync(tconn); + drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE)); + drbd_md_sync(mdev); } if (0) { disconnect: - conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD); + drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); + drbd_md_sync(mdev); } - clear_bit(SIGNAL_ASENDER, &tconn->flags); + clear_bit(SIGNAL_ASENDER, &mdev->flags); - conn_info(tconn, "asender terminated\n"); + D_ASSERT(mdev->state.conn < C_CONNECTED); + dev_info(DEV, "asender terminated\n"); return 0; } diff --git a/trunk/drivers/block/drbd/drbd_req.c b/trunk/drivers/block/drbd/drbd_req.c index f58a4a4b4dfb..01b2ac641c7b 100644 --- a/trunk/drivers/block/drbd/drbd_req.c +++ b/trunk/drivers/block/drbd/drbd_req.c @@ -31,8 +31,6 @@ #include "drbd_req.h" -static bool drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int size); - /* Update disk stats at start of I/O request */ static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req, struct bio *bio) { @@ -42,8 +40,6 @@ static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req part_round_stats(cpu, &mdev->vdisk->part0); part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]); part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], bio_sectors(bio)); - (void) cpu; /* The macro invocations above want the cpu argument, I do not like - the compiler warning about cpu only assigned but never used... */ part_inc_in_flight(&mdev->vdisk->part0, rw); part_stat_unlock(); } @@ -61,51 +57,9 @@ static void _drbd_end_io_acct(struct drbd_conf *mdev, struct drbd_request *req) part_stat_unlock(); } -static struct drbd_request *drbd_req_new(struct drbd_conf *mdev, - struct bio *bio_src) -{ - struct drbd_request *req; - - req = mempool_alloc(drbd_request_mempool, GFP_NOIO); - if (!req) - return NULL; - - drbd_req_make_private_bio(req, bio_src); - req->rq_state = bio_data_dir(bio_src) == WRITE ? 
RQ_WRITE : 0; - req->w.mdev = mdev; - req->master_bio = bio_src; - req->epoch = 0; - - drbd_clear_interval(&req->i); - req->i.sector = bio_src->bi_sector; - req->i.size = bio_src->bi_size; - req->i.local = true; - req->i.waiting = false; - - INIT_LIST_HEAD(&req->tl_requests); - INIT_LIST_HEAD(&req->w.list); - - /* one reference to be put by __drbd_make_request */ - atomic_set(&req->completion_ref, 1); - /* one kref as long as completion_ref > 0 */ - kref_init(&req->kref); - return req; -} - -void drbd_req_destroy(struct kref *kref) +static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const int rw) { - struct drbd_request *req = container_of(kref, struct drbd_request, kref); - struct drbd_conf *mdev = req->w.mdev; - const unsigned s = req->rq_state; - - if ((req->master_bio && !(s & RQ_POSTPONED)) || - atomic_read(&req->completion_ref) || - (s & RQ_LOCAL_PENDING) || - ((s & RQ_NET_MASK) && !(s & RQ_NET_DONE))) { - dev_err(DEV, "drbd_req_destroy: Logic BUG rq_state = 0x%x, completion_ref = %d\n", - s, atomic_read(&req->completion_ref)); - return; - } + const unsigned long s = req->rq_state; /* remove it from the transfer log. * well, only if it had been there in the first @@ -113,33 +67,24 @@ void drbd_req_destroy(struct kref *kref) * and never sent), it should still be "empty" as * initialized in drbd_req_new(), so we can list_del() it * here unconditionally */ - list_del_init(&req->tl_requests); + list_del(&req->tl_requests); /* if it was a write, we may have to set the corresponding * bit(s) out-of-sync first. If it had a local part, we need to * release the reference to the activity log. */ - if (s & RQ_WRITE) { + if (rw == WRITE) { /* Set out-of-sync unless both OK flags are set * (local only or remote failed). * Other places where we set out-of-sync: * READ with local io-error */ + if (!(s & RQ_NET_OK) || !(s & RQ_LOCAL_OK)) + drbd_set_out_of_sync(mdev, req->sector, req->size); - /* There is a special case: - * we may notice late that IO was suspended, - * and postpone, or schedule for retry, a write, - * before it even was submitted or sent. - * In that case we do not want to touch the bitmap at all. - */ - if ((s & (RQ_POSTPONED|RQ_LOCAL_MASK|RQ_NET_MASK)) != RQ_POSTPONED) { - if (!(s & RQ_NET_OK) || !(s & RQ_LOCAL_OK)) - drbd_set_out_of_sync(mdev, req->i.sector, req->i.size); - - if ((s & RQ_NET_OK) && (s & RQ_LOCAL_OK) && (s & RQ_NET_SIS)) - drbd_set_in_sync(mdev, req->i.sector, req->i.size); - } + if ((s & RQ_NET_OK) && (s & RQ_LOCAL_OK) && (s & RQ_NET_SIS)) + drbd_set_in_sync(mdev, req->sector, req->size); /* one might be tempted to move the drbd_al_complete_io - * to the local io completion callback drbd_request_endio. + * to the local io completion callback drbd_endio_pri. * but, if this was a mirror write, we may only * drbd_al_complete_io after this is RQ_NET_DONE, * otherwise the extent could be dropped from the al @@ -148,35 +93,109 @@ void drbd_req_destroy(struct kref *kref) * but after the extent has been dropped from the al, * we would forget to resync the corresponding extent. 
*/ - if (s & RQ_IN_ACT_LOG) { + if (s & RQ_LOCAL_MASK) { if (get_ldev_if_state(mdev, D_FAILED)) { - drbd_al_complete_io(mdev, &req->i); + if (s & RQ_IN_ACT_LOG) + drbd_al_complete_io(mdev, req->sector); put_ldev(mdev); } else if (__ratelimit(&drbd_ratelimit_state)) { - dev_warn(DEV, "Should have called drbd_al_complete_io(, %llu, %u), " - "but my Disk seems to have failed :(\n", - (unsigned long long) req->i.sector, req->i.size); + dev_warn(DEV, "Should have called drbd_al_complete_io(, %llu), " + "but my Disk seems to have failed :(\n", + (unsigned long long) req->sector); } } } - mempool_free(req, drbd_request_mempool); + drbd_req_free(req); } -static void wake_all_senders(struct drbd_tconn *tconn) { - wake_up(&tconn->sender_work.q_wait); +static void queue_barrier(struct drbd_conf *mdev) +{ + struct drbd_tl_epoch *b; + + /* We are within the req_lock. Once we queued the barrier for sending, + * we set the CREATE_BARRIER bit. It is cleared as soon as a new + * barrier/epoch object is added. This is the only place this bit is + * set. It indicates that the barrier for this epoch is already queued, + * and no new epoch has been created yet. */ + if (test_bit(CREATE_BARRIER, &mdev->flags)) + return; + + b = mdev->newest_tle; + b->w.cb = w_send_barrier; + /* inc_ap_pending done here, so we won't + * get imbalanced on connection loss. + * dec_ap_pending will be done in got_BarrierAck + * or (on connection loss) in tl_clear. */ + inc_ap_pending(mdev); + drbd_queue_work(&mdev->data.work, &b->w); + set_bit(CREATE_BARRIER, &mdev->flags); } -/* must hold resource->req_lock */ -static void start_new_tl_epoch(struct drbd_tconn *tconn) +static void _about_to_complete_local_write(struct drbd_conf *mdev, + struct drbd_request *req) { - /* no point closing an epoch, if it is empty, anyways. */ - if (tconn->current_tle_writes == 0) - return; + const unsigned long s = req->rq_state; + struct drbd_request *i; + struct drbd_epoch_entry *e; + struct hlist_node *n; + struct hlist_head *slot; - tconn->current_tle_writes = 0; - atomic_inc(&tconn->current_tle_nr); - wake_all_senders(tconn); + /* Before we can signal completion to the upper layers, + * we may need to close the current epoch. + * We can skip this, if this request has not even been sent, because we + * did not have a fully established connection yet/anymore, during + * bitmap exchange, or while we are C_AHEAD due to congestion policy. + */ + if (mdev->state.conn >= C_CONNECTED && + (s & RQ_NET_SENT) != 0 && + req->epoch == mdev->newest_tle->br_number) + queue_barrier(mdev); + + /* we need to do the conflict detection stuff, + * if we have the ee_hash (two_primaries) and + * this has been on the network */ + if ((s & RQ_NET_DONE) && mdev->ee_hash != NULL) { + const sector_t sector = req->sector; + const int size = req->size; + + /* ASSERT: + * there must be no conflicting requests, since + * they must have been failed on the spot */ +#define OVERLAPS overlaps(sector, size, i->sector, i->size) + slot = tl_hash_slot(mdev, sector); + hlist_for_each_entry(i, n, slot, collision) { + if (OVERLAPS) { + dev_alert(DEV, "LOGIC BUG: completed: %p %llus +%u; " + "other: %p %llus +%u\n", + req, (unsigned long long)sector, size, + i, (unsigned long long)i->sector, i->size); + } + } + + /* maybe "wake" those conflicting epoch entries + * that wait for this request to finish. 
+ * + * currently, there can be only _one_ such ee + * (well, or some more, which would be pending + * P_DISCARD_ACK not yet sent by the asender...), + * since we block the receiver thread upon the + * first conflict detection, which will wait on + * misc_wait. maybe we want to assert that? + * + * anyways, if we found one, + * we just have to do a wake_up. */ +#undef OVERLAPS +#define OVERLAPS overlaps(sector, size, e->sector, e->size) + slot = ee_hash_slot(mdev, req->sector); + hlist_for_each_entry(e, n, slot, collision) { + if (OVERLAPS) { + wake_up(&mdev->misc_wait); + break; + } + } + } +#undef OVERLAPS } void complete_master_bio(struct drbd_conf *mdev, @@ -186,33 +205,17 @@ void complete_master_bio(struct drbd_conf *mdev, dec_ap_bio(mdev); } - -static void drbd_remove_request_interval(struct rb_root *root, - struct drbd_request *req) -{ - struct drbd_conf *mdev = req->w.mdev; - struct drbd_interval *i = &req->i; - - drbd_remove_interval(root, i); - - /* Wake up any processes waiting for this request to complete. */ - if (i->waiting) - wake_up(&mdev->misc_wait); -} - /* Helper for __req_mod(). * Set m->bio to the master bio, if it is fit to be completed, * or leave it alone (it is initialized to NULL in __req_mod), * if it has already been completed, or cannot be completed yet. * If m->bio is set, the error status to be returned is placed in m->error. */ -static -void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m) +void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m) { - const unsigned s = req->rq_state; - struct drbd_conf *mdev = req->w.mdev; - int rw; - int error, ok; + const unsigned long s = req->rq_state; + struct drbd_conf *mdev = req->mdev; + int rw = req->rq_state & RQ_WRITE ? WRITE : READ; /* we must not complete the master bio, while it is * still being processed by _drbd_send_zc_bio (drbd_send_dblock) @@ -223,220 +226,165 @@ void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m) * the receiver, * the bio_endio completion callbacks. */ - if ((s & RQ_LOCAL_PENDING && !(s & RQ_LOCAL_ABORTED)) || - (s & RQ_NET_QUEUED) || (s & RQ_NET_PENDING) || - (s & RQ_COMPLETION_SUSP)) { - dev_err(DEV, "drbd_req_complete: Logic BUG rq_state = 0x%x\n", s); + if (s & RQ_NET_QUEUED) return; - } - - if (!req->master_bio) { - dev_err(DEV, "drbd_req_complete: Logic BUG, master_bio == NULL!\n"); + if (s & RQ_NET_PENDING) + return; + if (s & RQ_LOCAL_PENDING && !(s & RQ_LOCAL_ABORTED)) return; - } - - rw = bio_rw(req->master_bio); - /* - * figure out whether to report success or failure. - * - * report success when at least one of the operations succeeded. - * or, to put the other way, - * only report failure, when both operations failed. - * - * what to do about the failures is handled elsewhere. - * what we need to do here is just: complete the master_bio. - * - * local completion error, if any, has been stored as ERR_PTR - * in private_bio within drbd_request_endio. - */ - ok = (s & RQ_LOCAL_OK) || (s & RQ_NET_OK); - error = PTR_ERR(req->private_bio); + if (req->master_bio) { + /* this is data_received (remote read) + * or protocol C P_WRITE_ACK + * or protocol B P_RECV_ACK + * or protocol A "handed_over_to_network" (SendAck) + * or canceled or failed, + * or killed from the transfer log due to connection loss. + */ - /* remove the request from the conflict detection - * respective block_id verification hash */ - if (!drbd_interval_empty(&req->i)) { - struct rb_root *root; + /* + * figure out whether to report success or failure. 
+ * + * report success when at least one of the operations succeeded. + * or, to put the other way, + * only report failure, when both operations failed. + * + * what to do about the failures is handled elsewhere. + * what we need to do here is just: complete the master_bio. + * + * local completion error, if any, has been stored as ERR_PTR + * in private_bio within drbd_endio_pri. + */ + int ok = (s & RQ_LOCAL_OK) || (s & RQ_NET_OK); + int error = PTR_ERR(req->private_bio); - if (rw == WRITE) - root = &mdev->write_requests; + /* remove the request from the conflict detection + * respective block_id verification hash */ + if (!hlist_unhashed(&req->collision)) + hlist_del(&req->collision); else - root = &mdev->read_requests; - drbd_remove_request_interval(root, req); - } else if (!(s & RQ_POSTPONED)) - D_ASSERT((s & (RQ_NET_MASK & ~RQ_NET_DONE)) == 0); + D_ASSERT((s & (RQ_NET_MASK & ~RQ_NET_DONE)) == 0); - /* Before we can signal completion to the upper layers, - * we may need to close the current transfer log epoch. - * We are within the request lock, so we can simply compare - * the request epoch number with the current transfer log - * epoch number. If they match, increase the current_tle_nr, - * and reset the transfer log epoch write_cnt. - */ - if (rw == WRITE && - req->epoch == atomic_read(&mdev->tconn->current_tle_nr)) - start_new_tl_epoch(mdev->tconn); - - /* Update disk stats */ - _drbd_end_io_acct(mdev, req); + /* for writes we need to do some extra housekeeping */ + if (rw == WRITE) + _about_to_complete_local_write(mdev, req); - /* If READ failed, - * have it be pushed back to the retry work queue, - * so it will re-enter __drbd_make_request(), - * and be re-assigned to a suitable local or remote path, - * or failed if we do not have access to good data anymore. - * - * Unless it was failed early by __drbd_make_request(), - * because no path was available, in which case - * it was not even added to the transfer_log. - * - * READA may fail, and will not be retried. - * - * WRITE should have used all available paths already. - */ - if (!ok && rw == READ && !list_empty(&req->tl_requests)) - req->rq_state |= RQ_POSTPONED; + /* Update disk stats */ + _drbd_end_io_acct(mdev, req); - if (!(req->rq_state & RQ_POSTPONED)) { m->error = ok ? 0 : (error ?: -EIO); m->bio = req->master_bio; req->master_bio = NULL; } -} - -static int drbd_req_put_completion_ref(struct drbd_request *req, struct bio_and_error *m, int put) -{ - struct drbd_conf *mdev = req->w.mdev; - D_ASSERT(m || (req->rq_state & RQ_POSTPONED)); - - if (!atomic_sub_and_test(put, &req->completion_ref)) - return 0; - drbd_req_complete(req, m); + if (s & RQ_LOCAL_PENDING) + return; - if (req->rq_state & RQ_POSTPONED) { - /* don't destroy the req object just yet, - * but queue it for retry */ - drbd_restart_request(req); - return 0; + if ((s & RQ_NET_MASK) == 0 || (s & RQ_NET_DONE)) { + /* this is disconnected (local only) operation, + * or protocol C P_WRITE_ACK, + * or protocol A or B P_BARRIER_ACK, + * or killed from the transfer log due to connection loss. */ + _req_is_done(mdev, req, rw); } - - return 1; + /* else: network part and not DONE yet. that is + * protocol A or B, barrier ack still pending... */ } -/* I'd like this to be the only place that manipulates - * req->completion_ref and req->kref. 
*/ -static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m, - int clear, int set) +static void _req_may_be_done_not_susp(struct drbd_request *req, struct bio_and_error *m) { - struct drbd_conf *mdev = req->w.mdev; - unsigned s = req->rq_state; - int c_put = 0; - int k_put = 0; - - if (drbd_suspended(mdev) && !((s | clear) & RQ_COMPLETION_SUSP)) - set |= RQ_COMPLETION_SUSP; - - /* apply */ - - req->rq_state &= ~clear; - req->rq_state |= set; - - /* no change? */ - if (req->rq_state == s) - return; - - /* intent: get references */ - - if (!(s & RQ_LOCAL_PENDING) && (set & RQ_LOCAL_PENDING)) - atomic_inc(&req->completion_ref); - - if (!(s & RQ_NET_PENDING) && (set & RQ_NET_PENDING)) { - inc_ap_pending(mdev); - atomic_inc(&req->completion_ref); - } + struct drbd_conf *mdev = req->mdev; - if (!(s & RQ_NET_QUEUED) && (set & RQ_NET_QUEUED)) - atomic_inc(&req->completion_ref); - - if (!(s & RQ_EXP_BARR_ACK) && (set & RQ_EXP_BARR_ACK)) - kref_get(&req->kref); /* wait for the DONE */ - - if (!(s & RQ_NET_SENT) && (set & RQ_NET_SENT)) - atomic_add(req->i.size >> 9, &mdev->ap_in_flight); - - if (!(s & RQ_COMPLETION_SUSP) && (set & RQ_COMPLETION_SUSP)) - atomic_inc(&req->completion_ref); - - /* progress: put references */ - - if ((s & RQ_COMPLETION_SUSP) && (clear & RQ_COMPLETION_SUSP)) - ++c_put; - - if (!(s & RQ_LOCAL_ABORTED) && (set & RQ_LOCAL_ABORTED)) { - D_ASSERT(req->rq_state & RQ_LOCAL_PENDING); - /* local completion may still come in later, - * we need to keep the req object around. */ - kref_get(&req->kref); - ++c_put; - } + if (!is_susp(mdev->state)) + _req_may_be_done(req, m); +} - if ((s & RQ_LOCAL_PENDING) && (clear & RQ_LOCAL_PENDING)) { - if (req->rq_state & RQ_LOCAL_ABORTED) - ++k_put; - else - ++c_put; - } +/* + * checks whether there was an overlapping request + * or ee already registered. + * + * if so, return 1, in which case this request is completed on the spot, + * without ever being submitted or send. + * + * return 0 if it is ok to submit this request. + * + * NOTE: + * paranoia: assume something above us is broken, and issues different write + * requests for the same block simultaneously... + * + * To ensure these won't be reordered differently on both nodes, resulting in + * diverging data sets, we discard the later one(s). Not that this is supposed + * to happen, but this is the rationale why we also have to check for + * conflicting requests with local origin, and why we have to do so regardless + * of whether we allowed multiple primaries. + * + * BTW, in case we only have one primary, the ee_hash is empty anyways, and the + * second hlist_for_each_entry becomes a noop. This is even simpler than to + * grab a reference on the net_conf, and check for the two_primaries flag... 
+ */ +static int _req_conflicts(struct drbd_request *req) +{ + struct drbd_conf *mdev = req->mdev; + const sector_t sector = req->sector; + const int size = req->size; + struct drbd_request *i; + struct drbd_epoch_entry *e; + struct hlist_node *n; + struct hlist_head *slot; - if ((s & RQ_NET_PENDING) && (clear & RQ_NET_PENDING)) { - dec_ap_pending(mdev); - ++c_put; - } + D_ASSERT(hlist_unhashed(&req->collision)); - if ((s & RQ_NET_QUEUED) && (clear & RQ_NET_QUEUED)) - ++c_put; + if (!get_net_conf(mdev)) + return 0; - if ((s & RQ_EXP_BARR_ACK) && !(s & RQ_NET_DONE) && (set & RQ_NET_DONE)) { - if (req->rq_state & RQ_NET_SENT) - atomic_sub(req->i.size >> 9, &mdev->ap_in_flight); - ++k_put; + /* BUG_ON */ + ERR_IF (mdev->tl_hash_s == 0) + goto out_no_conflict; + BUG_ON(mdev->tl_hash == NULL); + +#define OVERLAPS overlaps(i->sector, i->size, sector, size) + slot = tl_hash_slot(mdev, sector); + hlist_for_each_entry(i, n, slot, collision) { + if (OVERLAPS) { + dev_alert(DEV, "%s[%u] Concurrent local write detected! " + "[DISCARD L] new: %llus +%u; " + "pending: %llus +%u\n", + current->comm, current->pid, + (unsigned long long)sector, size, + (unsigned long long)i->sector, i->size); + goto out_conflict; + } } - /* potentially complete and destroy */ - - if (k_put || c_put) { - /* Completion does it's own kref_put. If we are going to - * kref_sub below, we need req to be still around then. */ - int at_least = k_put + !!c_put; - int refcount = atomic_read(&req->kref.refcount); - if (refcount < at_least) - dev_err(DEV, - "mod_rq_state: Logic BUG: %x -> %x: refcount = %d, should be >= %d\n", - s, req->rq_state, refcount, at_least); + if (mdev->ee_hash_s) { + /* now, check for overlapping requests with remote origin */ + BUG_ON(mdev->ee_hash == NULL); +#undef OVERLAPS +#define OVERLAPS overlaps(e->sector, e->size, sector, size) + slot = ee_hash_slot(mdev, sector); + hlist_for_each_entry(e, n, slot, collision) { + if (OVERLAPS) { + dev_alert(DEV, "%s[%u] Concurrent remote write detected!" + " [DISCARD L] new: %llus +%u; " + "pending: %llus +%u\n", + current->comm, current->pid, + (unsigned long long)sector, size, + (unsigned long long)e->sector, e->size); + goto out_conflict; + } + } } +#undef OVERLAPS - /* If we made progress, retry conflicting peer requests, if any. */ - if (req->i.waiting) - wake_up(&mdev->misc_wait); - - if (c_put) - k_put += drbd_req_put_completion_ref(req, m, c_put); - if (k_put) - kref_sub(&req->kref, k_put, drbd_req_destroy); -} - -static void drbd_report_io_error(struct drbd_conf *mdev, struct drbd_request *req) -{ - char b[BDEVNAME_SIZE]; - - if (!__ratelimit(&drbd_ratelimit_state)) - return; +out_no_conflict: + /* this is like it should be, and what we expected. + * our users do behave after all... */ + put_net_conf(mdev); + return 0; - dev_warn(DEV, "local %s IO error sector %llu+%u on %s\n", - (req->rq_state & RQ_WRITE) ? 
"WRITE" : "READ", - (unsigned long long)req->i.sector, - req->i.size >> 9, - bdevname(mdev->ldev->backing_bdev, b)); +out_conflict: + put_net_conf(mdev); + return 1; } /* obviously this could be coded as many single functions @@ -454,12 +402,9 @@ static void drbd_report_io_error(struct drbd_conf *mdev, struct drbd_request *re int __req_mod(struct drbd_request *req, enum drbd_req_event what, struct bio_and_error *m) { - struct drbd_conf *mdev = req->w.mdev; - struct net_conf *nc; - int p, rv = 0; - - if (m) - m->bio = NULL; + struct drbd_conf *mdev = req->mdev; + int rv = 0; + m->bio = NULL; switch (what) { default: @@ -468,91 +413,116 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, /* does not happen... * initialization done in drbd_req_new - case CREATED: + case created: break; */ - case TO_BE_SENT: /* via network */ - /* reached via __drbd_make_request + case to_be_send: /* via network */ + /* reached via drbd_make_request_common * and from w_read_retry_remote */ D_ASSERT(!(req->rq_state & RQ_NET_MASK)); - rcu_read_lock(); - nc = rcu_dereference(mdev->tconn->net_conf); - p = nc->wire_protocol; - rcu_read_unlock(); - req->rq_state |= - p == DRBD_PROT_C ? RQ_EXP_WRITE_ACK : - p == DRBD_PROT_B ? RQ_EXP_RECEIVE_ACK : 0; - mod_rq_state(req, m, 0, RQ_NET_PENDING); + req->rq_state |= RQ_NET_PENDING; + inc_ap_pending(mdev); break; - case TO_BE_SUBMITTED: /* locally */ - /* reached via __drbd_make_request */ + case to_be_submitted: /* locally */ + /* reached via drbd_make_request_common */ D_ASSERT(!(req->rq_state & RQ_LOCAL_MASK)); - mod_rq_state(req, m, 0, RQ_LOCAL_PENDING); + req->rq_state |= RQ_LOCAL_PENDING; break; - case COMPLETED_OK: + case completed_ok: if (req->rq_state & RQ_WRITE) - mdev->writ_cnt += req->i.size >> 9; + mdev->writ_cnt += req->size>>9; else - mdev->read_cnt += req->i.size >> 9; + mdev->read_cnt += req->size>>9; - mod_rq_state(req, m, RQ_LOCAL_PENDING, - RQ_LOCAL_COMPLETED|RQ_LOCAL_OK); + req->rq_state |= (RQ_LOCAL_COMPLETED|RQ_LOCAL_OK); + req->rq_state &= ~RQ_LOCAL_PENDING; + + _req_may_be_done_not_susp(req, m); break; - case ABORT_DISK_IO: - mod_rq_state(req, m, 0, RQ_LOCAL_ABORTED); + case abort_disk_io: + req->rq_state |= RQ_LOCAL_ABORTED; + if (req->rq_state & RQ_WRITE) + _req_may_be_done_not_susp(req, m); + else + goto goto_queue_for_net_read; break; - case WRITE_COMPLETED_WITH_ERROR: - drbd_report_io_error(mdev, req); - __drbd_chk_io_error(mdev, DRBD_WRITE_ERROR); - mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED); + case write_completed_with_error: + req->rq_state |= RQ_LOCAL_COMPLETED; + req->rq_state &= ~RQ_LOCAL_PENDING; + + __drbd_chk_io_error(mdev, DRBD_IO_ERROR); + _req_may_be_done_not_susp(req, m); break; - case READ_COMPLETED_WITH_ERROR: - drbd_set_out_of_sync(mdev, req->i.sector, req->i.size); - drbd_report_io_error(mdev, req); - __drbd_chk_io_error(mdev, DRBD_READ_ERROR); - /* fall through. */ - case READ_AHEAD_COMPLETED_WITH_ERROR: - /* it is legal to fail READA, no __drbd_chk_io_error in that case. 
*/ - mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED); + case read_ahead_completed_with_error: + /* it is legal to fail READA */ + req->rq_state |= RQ_LOCAL_COMPLETED; + req->rq_state &= ~RQ_LOCAL_PENDING; + _req_may_be_done_not_susp(req, m); break; - case QUEUE_FOR_NET_READ: + case read_completed_with_error: + drbd_set_out_of_sync(mdev, req->sector, req->size); + + req->rq_state |= RQ_LOCAL_COMPLETED; + req->rq_state &= ~RQ_LOCAL_PENDING; + + if (req->rq_state & RQ_LOCAL_ABORTED) { + _req_may_be_done(req, m); + break; + } + + __drbd_chk_io_error(mdev, DRBD_IO_ERROR); + + goto_queue_for_net_read: + + D_ASSERT(!(req->rq_state & RQ_NET_MASK)); + + /* no point in retrying if there is no good remote data, + * or we have no connection. */ + if (mdev->state.pdsk != D_UP_TO_DATE) { + _req_may_be_done_not_susp(req, m); + break; + } + + /* _req_mod(req,to_be_send); oops, recursion... */ + req->rq_state |= RQ_NET_PENDING; + inc_ap_pending(mdev); + /* fall through: _req_mod(req,queue_for_net_read); */ + + case queue_for_net_read: /* READ or READA, and * no local disk, * or target area marked as invalid, * or just got an io-error. */ - /* from __drbd_make_request + /* from drbd_make_request_common * or from bio_endio during read io-error recovery */ - /* So we can verify the handle in the answer packet. - * Corresponding drbd_remove_request_interval is in - * drbd_req_complete() */ - D_ASSERT(drbd_interval_empty(&req->i)); - drbd_insert_interval(&mdev->read_requests, &req->i); + /* so we can verify the handle in the answer packet + * corresponding hlist_del is in _req_may_be_done() */ + hlist_add_head(&req->collision, ar_hash_slot(mdev, req->sector)); set_bit(UNPLUG_REMOTE, &mdev->flags); D_ASSERT(req->rq_state & RQ_NET_PENDING); - D_ASSERT((req->rq_state & RQ_LOCAL_MASK) == 0); - mod_rq_state(req, m, 0, RQ_NET_QUEUED); - req->w.cb = w_send_read_req; - drbd_queue_work(&mdev->tconn->sender_work, &req->w); + req->rq_state |= RQ_NET_QUEUED; + req->w.cb = (req->rq_state & RQ_LOCAL_MASK) + ? w_read_retry_remote + : w_send_read_req; + drbd_queue_work(&mdev->data.work, &req->w); break; - case QUEUE_FOR_NET_WRITE: + case queue_for_net_write: /* assert something? */ - /* from __drbd_make_request only */ + /* from drbd_make_request_common only */ - /* Corresponding drbd_remove_request_interval is in - * drbd_req_complete() */ - D_ASSERT(drbd_interval_empty(&req->i)); - drbd_insert_interval(&mdev->write_requests, &req->i); + hlist_add_head(&req->collision, tl_hash_slot(mdev, req->sector)); + /* corresponding hlist_del is in _req_may_be_done() */ /* NOTE * In case the req ended up on the transfer log before being @@ -563,7 +533,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, * * _req_add_to_epoch(req); this has to be after the * _maybe_start_new_epoch(req); which happened in - * __drbd_make_request, because we now may set the bit + * drbd_make_request_common, because we now may set the bit * again ourselves to close the current epoch. * * Add req to the (now) current epoch (barrier). */ @@ -573,187 +543,202 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, * hurting performance. 
*/ set_bit(UNPLUG_REMOTE, &mdev->flags); + /* see drbd_make_request_common, + * just after it grabs the req_lock */ + D_ASSERT(test_bit(CREATE_BARRIER, &mdev->flags) == 0); + + req->epoch = mdev->newest_tle->br_number; + + /* increment size of current epoch */ + mdev->newest_tle->n_writes++; + /* queue work item to send data */ D_ASSERT(req->rq_state & RQ_NET_PENDING); - mod_rq_state(req, m, 0, RQ_NET_QUEUED|RQ_EXP_BARR_ACK); + req->rq_state |= RQ_NET_QUEUED; req->w.cb = w_send_dblock; - drbd_queue_work(&mdev->tconn->sender_work, &req->w); + drbd_queue_work(&mdev->data.work, &req->w); /* close the epoch, in case it outgrew the limit */ - rcu_read_lock(); - nc = rcu_dereference(mdev->tconn->net_conf); - p = nc->max_epoch_size; - rcu_read_unlock(); - if (mdev->tconn->current_tle_writes >= p) - start_new_tl_epoch(mdev->tconn); + if (mdev->newest_tle->n_writes >= mdev->net_conf->max_epoch_size) + queue_barrier(mdev); break; - case QUEUE_FOR_SEND_OOS: - mod_rq_state(req, m, 0, RQ_NET_QUEUED); - req->w.cb = w_send_out_of_sync; - drbd_queue_work(&mdev->tconn->sender_work, &req->w); + case queue_for_send_oos: + req->rq_state |= RQ_NET_QUEUED; + req->w.cb = w_send_oos; + drbd_queue_work(&mdev->data.work, &req->w); break; - case READ_RETRY_REMOTE_CANCELED: - case SEND_CANCELED: - case SEND_FAILED: + case read_retry_remote_canceled: + case send_canceled: + case send_failed: /* real cleanup will be done from tl_clear. just update flags * so it is no longer marked as on the worker queue */ - mod_rq_state(req, m, RQ_NET_QUEUED, 0); + req->rq_state &= ~RQ_NET_QUEUED; + /* if we did it right, tl_clear should be scheduled only after + * this, so this should not be necessary! */ + _req_may_be_done_not_susp(req, m); break; - case HANDED_OVER_TO_NETWORK: + case handed_over_to_network: /* assert something? */ + if (bio_data_dir(req->master_bio) == WRITE) + atomic_add(req->size>>9, &mdev->ap_in_flight); + if (bio_data_dir(req->master_bio) == WRITE && - !(req->rq_state & (RQ_EXP_RECEIVE_ACK | RQ_EXP_WRITE_ACK))) { + mdev->net_conf->wire_protocol == DRBD_PROT_A) { /* this is what is dangerous about protocol A: * pretend it was successfully written on the peer. */ - if (req->rq_state & RQ_NET_PENDING) - mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK); - /* else: neg-ack was faster... */ + if (req->rq_state & RQ_NET_PENDING) { + dec_ap_pending(mdev); + req->rq_state &= ~RQ_NET_PENDING; + req->rq_state |= RQ_NET_OK; + } /* else: neg-ack was faster... */ /* it is still not yet RQ_NET_DONE until the * corresponding epoch barrier got acked as well, * so we know what to dirty on connection loss */ } - mod_rq_state(req, m, RQ_NET_QUEUED, RQ_NET_SENT); + req->rq_state &= ~RQ_NET_QUEUED; + req->rq_state |= RQ_NET_SENT; + _req_may_be_done_not_susp(req, m); break; - case OOS_HANDED_TO_NETWORK: + case oos_handed_to_network: /* Was not set PENDING, no longer QUEUED, so is now DONE * as far as this connection is concerned. */ - mod_rq_state(req, m, RQ_NET_QUEUED, RQ_NET_DONE); + req->rq_state &= ~RQ_NET_QUEUED; + req->rq_state |= RQ_NET_DONE; + _req_may_be_done_not_susp(req, m); break; - case CONNECTION_LOST_WHILE_PENDING: + case connection_lost_while_pending: /* transfer log cleanup after connection loss */ - mod_rq_state(req, m, - RQ_NET_OK|RQ_NET_PENDING|RQ_COMPLETION_SUSP, - RQ_NET_DONE); + /* assert something? 
*/ + if (req->rq_state & RQ_NET_PENDING) + dec_ap_pending(mdev); + req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING); + req->rq_state |= RQ_NET_DONE; + if (req->rq_state & RQ_NET_SENT && req->rq_state & RQ_WRITE) + atomic_sub(req->size>>9, &mdev->ap_in_flight); + + /* if it is still queued, we may not complete it here. + * it will be canceled soon. */ + if (!(req->rq_state & RQ_NET_QUEUED)) + _req_may_be_done(req, m); /* Allowed while state.susp */ break; - case CONFLICT_RESOLVED: - /* for superseded conflicting writes of multiple primaries, + case conflict_discarded_by_peer: + /* for discarded conflicting writes of multiple primaries, * there is no need to keep anything in the tl, potential - * node crashes are covered by the activity log. - * - * If this request had been marked as RQ_POSTPONED before, - * it will actually not be completed, but "restarted", - * resubmitted from the retry worker context. */ - D_ASSERT(req->rq_state & RQ_NET_PENDING); - D_ASSERT(req->rq_state & RQ_EXP_WRITE_ACK); - mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_DONE|RQ_NET_OK); - break; - - case WRITE_ACKED_BY_PEER_AND_SIS: - req->rq_state |= RQ_NET_SIS; - case WRITE_ACKED_BY_PEER: - D_ASSERT(req->rq_state & RQ_EXP_WRITE_ACK); + * node crashes are covered by the activity log. */ + if (what == conflict_discarded_by_peer) + dev_alert(DEV, "Got DiscardAck packet %llus +%u!" + " DRBD is not a random data generator!\n", + (unsigned long long)req->sector, req->size); + req->rq_state |= RQ_NET_DONE; + /* fall through */ + case write_acked_by_peer_and_sis: + case write_acked_by_peer: + if (what == write_acked_by_peer_and_sis) + req->rq_state |= RQ_NET_SIS; /* protocol C; successfully written on peer. * Nothing more to do here. * We want to keep the tl in place for all protocols, to cater * for volatile write-back caches on lower level devices. */ - goto ack_common; - case RECV_ACKED_BY_PEER: - D_ASSERT(req->rq_state & RQ_EXP_RECEIVE_ACK); + case recv_acked_by_peer: /* protocol B; pretends to be successfully written on peer. - * see also notes above in HANDED_OVER_TO_NETWORK about + * see also notes above in handed_over_to_network about * protocol != C */ - ack_common: + req->rq_state |= RQ_NET_OK; D_ASSERT(req->rq_state & RQ_NET_PENDING); - mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK); + dec_ap_pending(mdev); + atomic_sub(req->size>>9, &mdev->ap_in_flight); + req->rq_state &= ~RQ_NET_PENDING; + _req_may_be_done_not_susp(req, m); break; - case POSTPONE_WRITE: - D_ASSERT(req->rq_state & RQ_EXP_WRITE_ACK); - /* If this node has already detected the write conflict, the - * worker will be waiting on misc_wait. Wake it up once this - * request has completed locally. - */ - D_ASSERT(req->rq_state & RQ_NET_PENDING); - req->rq_state |= RQ_POSTPONED; - if (req->i.waiting) - wake_up(&mdev->misc_wait); - /* Do not clear RQ_NET_PENDING. This request will make further - * progress via restart_conflicting_writes() or - * fail_postponed_requests(). Hopefully. */ - break; + case neg_acked: + /* assert something? 
*/ + if (req->rq_state & RQ_NET_PENDING) { + dec_ap_pending(mdev); + atomic_sub(req->size>>9, &mdev->ap_in_flight); + } + req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING); - case NEG_ACKED: - mod_rq_state(req, m, RQ_NET_OK|RQ_NET_PENDING, 0); + req->rq_state |= RQ_NET_DONE; + _req_may_be_done_not_susp(req, m); + /* else: done by handed_over_to_network */ break; - case FAIL_FROZEN_DISK_IO: + case fail_frozen_disk_io: if (!(req->rq_state & RQ_LOCAL_COMPLETED)) break; - mod_rq_state(req, m, RQ_COMPLETION_SUSP, 0); + + _req_may_be_done(req, m); /* Allowed while state.susp */ break; - case RESTART_FROZEN_DISK_IO: + case restart_frozen_disk_io: if (!(req->rq_state & RQ_LOCAL_COMPLETED)) break; - mod_rq_state(req, m, - RQ_COMPLETION_SUSP|RQ_LOCAL_COMPLETED, - RQ_LOCAL_PENDING); + req->rq_state &= ~RQ_LOCAL_COMPLETED; rv = MR_READ; if (bio_data_dir(req->master_bio) == WRITE) rv = MR_WRITE; - get_ldev(mdev); /* always succeeds in this call path */ + get_ldev(mdev); req->w.cb = w_restart_disk_io; - drbd_queue_work(&mdev->tconn->sender_work, &req->w); + drbd_queue_work(&mdev->data.work, &req->w); break; - case RESEND: + case resend: /* Simply complete (local only) READs. */ if (!(req->rq_state & RQ_WRITE) && !req->w.cb) { - mod_rq_state(req, m, RQ_COMPLETION_SUSP, 0); + _req_may_be_done(req, m); break; } /* If RQ_NET_OK is already set, we got a P_WRITE_ACK or P_RECV_ACK - before the connection loss (B&C only); only P_BARRIER_ACK - (or the local completion?) was missing when we suspended. - Throwing them out of the TL here by pretending we got a BARRIER_ACK. - During connection handshake, we ensure that the peer was not rebooted. */ + before the connection loss (B&C only); only P_BARRIER_ACK was missing. + Trowing them out of the TL here by pretending we got a BARRIER_ACK + We ensure that the peer was not rebooted */ if (!(req->rq_state & RQ_NET_OK)) { - /* FIXME could this possibly be a req->w.cb == w_send_out_of_sync? - * in that case we must not set RQ_NET_PENDING. */ - - mod_rq_state(req, m, RQ_COMPLETION_SUSP, RQ_NET_QUEUED|RQ_NET_PENDING); if (req->w.cb) { - drbd_queue_work(&mdev->tconn->sender_work, &req->w); + drbd_queue_work(&mdev->data.work, &req->w); rv = req->rq_state & RQ_WRITE ? MR_WRITE : MR_READ; - } /* else: FIXME can this happen? */ + } break; } - /* else, fall through to BARRIER_ACKED */ + /* else, fall through to barrier_acked */ - case BARRIER_ACKED: - /* barrier ack for READ requests does not make sense */ + case barrier_acked: if (!(req->rq_state & RQ_WRITE)) break; if (req->rq_state & RQ_NET_PENDING) { - /* barrier came in before all requests were acked. + /* barrier came in before all requests have been acked. * this is bad, because if the connection is lost now, * we won't be able to clean them up... */ - dev_err(DEV, "FIXME (BARRIER_ACKED but pending)\n"); + dev_err(DEV, "FIXME (barrier_acked but pending)\n"); + list_move(&req->tl_requests, &mdev->out_of_sequence_requests); } - /* Allowed to complete requests, even while suspended. - * As this is called for all requests within a matching epoch, - * we need to filter, and only set RQ_NET_DONE for those that - * have actually been on the wire. */ - mod_rq_state(req, m, RQ_COMPLETION_SUSP, - (req->rq_state & RQ_NET_MASK) ? 
RQ_NET_DONE : 0); + if ((req->rq_state & RQ_NET_MASK) != 0) { + req->rq_state |= RQ_NET_DONE; + if (mdev->net_conf->wire_protocol == DRBD_PROT_A) + atomic_sub(req->size>>9, &mdev->ap_in_flight); + } + _req_may_be_done(req, m); /* Allowed while state.susp */ break; - case DATA_RECEIVED: + case data_received: D_ASSERT(req->rq_state & RQ_NET_PENDING); - mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK|RQ_NET_DONE); + dec_ap_pending(mdev); + req->rq_state &= ~RQ_NET_PENDING; + req->rq_state |= (RQ_NET_OK|RQ_NET_DONE); + _req_may_be_done_not_susp(req, m); break; }; @@ -767,265 +752,75 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, * since size may be bigger than BM_BLOCK_SIZE, * we may need to check several bits. */ -static bool drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int size) +static int drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int size) { unsigned long sbnr, ebnr; sector_t esector, nr_sectors; if (mdev->state.disk == D_UP_TO_DATE) - return true; - if (mdev->state.disk != D_INCONSISTENT) - return false; - esector = sector + (size >> 9) - 1; + return 1; + if (mdev->state.disk >= D_OUTDATED) + return 0; + if (mdev->state.disk < D_INCONSISTENT) + return 0; + /* state.disk == D_INCONSISTENT We will have a look at the BitMap */ nr_sectors = drbd_get_capacity(mdev->this_bdev); + esector = sector + (size >> 9) - 1; + D_ASSERT(sector < nr_sectors); D_ASSERT(esector < nr_sectors); sbnr = BM_SECT_TO_BIT(sector); ebnr = BM_SECT_TO_BIT(esector); - return drbd_bm_count_bits(mdev, sbnr, ebnr) == 0; -} - -static bool remote_due_to_read_balancing(struct drbd_conf *mdev, sector_t sector, - enum drbd_read_balancing rbm) -{ - struct backing_dev_info *bdi; - int stripe_shift; - - switch (rbm) { - case RB_CONGESTED_REMOTE: - bdi = &mdev->ldev->backing_bdev->bd_disk->queue->backing_dev_info; - return bdi_read_congested(bdi); - case RB_LEAST_PENDING: - return atomic_read(&mdev->local_cnt) > - atomic_read(&mdev->ap_pending_cnt) + atomic_read(&mdev->rs_pending_cnt); - case RB_32K_STRIPING: /* stripe_shift = 15 */ - case RB_64K_STRIPING: - case RB_128K_STRIPING: - case RB_256K_STRIPING: - case RB_512K_STRIPING: - case RB_1M_STRIPING: /* stripe_shift = 20 */ - stripe_shift = (rbm - RB_32K_STRIPING + 15); - return (sector >> (stripe_shift - 9)) & 1; - case RB_ROUND_ROBIN: - return test_and_change_bit(READ_BALANCE_RR, &mdev->flags); - case RB_PREFER_REMOTE: - return true; - case RB_PREFER_LOCAL: - default: - return false; - } -} - -/* - * complete_conflicting_writes - wait for any conflicting write requests - * - * The write_requests tree contains all active write requests which we - * currently know about. Wait for any requests to complete which conflict with - * the new one. - * - * Only way out: remove the conflicting intervals from the tree. - */ -static void complete_conflicting_writes(struct drbd_request *req) -{ - DEFINE_WAIT(wait); - struct drbd_conf *mdev = req->w.mdev; - struct drbd_interval *i; - sector_t sector = req->i.sector; - int size = req->i.size; - - i = drbd_find_overlap(&mdev->write_requests, sector, size); - if (!i) - return; - - for (;;) { - prepare_to_wait(&mdev->misc_wait, &wait, TASK_UNINTERRUPTIBLE); - i = drbd_find_overlap(&mdev->write_requests, sector, size); - if (!i) - break; - /* Indicate to wake up device->misc_wait on progress. 
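drbd_may_do_local_read() above converts the request's sector range into a range of bitmap bit numbers and allows the local read only if none of those bits is set. A minimal sketch of that sector-to-bit arithmetic; the 4 KiB-per-bit granularity (BM_BLOCK_SHIFT = 12) is an assumption made for illustration:

#include <stdio.h>
#include <stdint.h>

typedef uint64_t sector_t;

/* Assumed granularity: one bitmap bit covers 4 KiB, i.e. 8 sectors of 512 bytes. */
#define BM_BLOCK_SHIFT 12

static unsigned long bm_sect_to_bit(sector_t sector)
{
    return sector >> (BM_BLOCK_SHIFT - 9);
}

int main(void)
{
    sector_t sector = 262269;                      /* request start, in 512-byte sectors */
    int size = 4096;                               /* request length in bytes */
    sector_t esector = sector + (size >> 9) - 1;   /* last sector touched */

    unsigned long sbnr = bm_sect_to_bit(sector);
    unsigned long ebnr = bm_sect_to_bit(esector);

    /* A local read is allowed only if no bit in [sbnr, ebnr] is set,
     * i.e. the whole range is already in sync on the local disk. */
    printf("sectors %llu..%llu -> bitmap bits %lu..%lu (%lu bit(s) to check)\n",
           (unsigned long long)sector, (unsigned long long)esector,
           sbnr, ebnr, ebnr - sbnr + 1);
    return 0;
}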
*/ - i->waiting = true; - spin_unlock_irq(&mdev->tconn->req_lock); - schedule(); - spin_lock_irq(&mdev->tconn->req_lock); - } - finish_wait(&mdev->misc_wait, &wait); + return 0 == drbd_bm_count_bits(mdev, sbnr, ebnr); } -/* called within req_lock and rcu_read_lock() */ static void maybe_pull_ahead(struct drbd_conf *mdev) { - struct drbd_tconn *tconn = mdev->tconn; - struct net_conf *nc; - bool congested = false; - enum drbd_on_congestion on_congestion; - - nc = rcu_dereference(tconn->net_conf); - on_congestion = nc ? nc->on_congestion : OC_BLOCK; - if (on_congestion == OC_BLOCK || - tconn->agreed_pro_version < 96) - return; + int congested = 0; /* If I don't even have good local storage, we can not reasonably try * to pull ahead of the peer. We also need the local reference to make * sure mdev->act_log is there. + * Note: caller has to make sure that net_conf is there. */ if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) return; - if (nc->cong_fill && - atomic_read(&mdev->ap_in_flight) >= nc->cong_fill) { + if (mdev->net_conf->cong_fill && + atomic_read(&mdev->ap_in_flight) >= mdev->net_conf->cong_fill) { dev_info(DEV, "Congestion-fill threshold reached\n"); - congested = true; + congested = 1; } - if (mdev->act_log->used >= nc->cong_extents) { + if (mdev->act_log->used >= mdev->net_conf->cong_extents) { dev_info(DEV, "Congestion-extents threshold reached\n"); - congested = true; + congested = 1; } if (congested) { - /* start a new epoch for non-mirrored writes */ - start_new_tl_epoch(mdev->tconn); + queue_barrier(mdev); /* last barrier, after mirrored writes */ - if (on_congestion == OC_PULL_AHEAD) + if (mdev->net_conf->on_congestion == OC_PULL_AHEAD) _drbd_set_state(_NS(mdev, conn, C_AHEAD), 0, NULL); - else /*nc->on_congestion == OC_DISCONNECT */ + else /*mdev->net_conf->on_congestion == OC_DISCONNECT */ _drbd_set_state(_NS(mdev, conn, C_DISCONNECTING), 0, NULL); } put_ldev(mdev); } -/* If this returns false, and req->private_bio is still set, - * this should be submitted locally. - * - * If it returns false, but req->private_bio is not set, - * we do not have access to good data :( - * - * Otherwise, this destroys req->private_bio, if any, - * and returns true. - */ -static bool do_remote_read(struct drbd_request *req) -{ - struct drbd_conf *mdev = req->w.mdev; - enum drbd_read_balancing rbm; - - if (req->private_bio) { - if (!drbd_may_do_local_read(mdev, - req->i.sector, req->i.size)) { - bio_put(req->private_bio); - req->private_bio = NULL; - put_ldev(mdev); - } - } - - if (mdev->state.pdsk != D_UP_TO_DATE) - return false; - - if (req->private_bio == NULL) - return true; - - /* TODO: improve read balancing decisions, take into account drbd - * protocol, pending requests etc. */ - - rcu_read_lock(); - rbm = rcu_dereference(mdev->ldev->disk_conf)->read_balancing; - rcu_read_unlock(); - - if (rbm == RB_PREFER_LOCAL && req->private_bio) - return false; /* submit locally */ - - if (remote_due_to_read_balancing(mdev, req->i.sector, rbm)) { - if (req->private_bio) { - bio_put(req->private_bio); - req->private_bio = NULL; - put_ldev(mdev); - } - return true; - } - - return false; -} - -/* returns number of connections (== 1, for drbd 8.4) - * expected to actually write this data, - * which does NOT include those that we are L_AHEAD for. 
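maybe_pull_ahead() above declares congestion when either of two thresholds is exceeded: the amount of data in flight to the peer (cong_fill) or the number of activity-log extents in use (cong_extents), and then either pulls ahead or disconnects depending on the on_congestion setting. A hedged userspace sketch of just that decision, with invented counter values and struct names:

#include <stdio.h>
#include <stdbool.h>

enum on_congestion { OC_BLOCK, OC_PULL_AHEAD, OC_DISCONNECT };

struct cong_state {
    unsigned int ap_in_flight;   /* data currently on the wire to the peer */
    unsigned int al_used;        /* activity-log extents currently in use */
};

struct cong_conf {
    unsigned int cong_fill;      /* 0 disables the fill check */
    unsigned int cong_extents;
    enum on_congestion policy;
};

/* Returns the action to take; OC_BLOCK means "keep replicating normally". */
static enum on_congestion check_congestion(const struct cong_state *s,
                                           const struct cong_conf *c)
{
    bool congested = false;

    if (c->cong_fill && s->ap_in_flight >= c->cong_fill)
        congested = true;        /* congestion-fill threshold reached */
    if (s->al_used >= c->cong_extents)
        congested = true;        /* congestion-extents threshold reached */

    return congested ? c->policy : OC_BLOCK;
}

int main(void)
{
    struct cong_conf conf = { .cong_fill = 1024, .cong_extents = 127,
                              .policy = OC_PULL_AHEAD };
    struct cong_state st = { .ap_in_flight = 2048, .al_used = 40 };

    printf("action: %d\n", check_congestion(&st, &conf));  /* 1 == pull ahead */
    return 0;
}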
*/ -static int drbd_process_write_request(struct drbd_request *req) -{ - struct drbd_conf *mdev = req->w.mdev; - int remote, send_oos; - - rcu_read_lock(); - remote = drbd_should_do_remote(mdev->state); - if (remote) { - maybe_pull_ahead(mdev); - remote = drbd_should_do_remote(mdev->state); - } - send_oos = drbd_should_send_out_of_sync(mdev->state); - rcu_read_unlock(); - - /* Need to replicate writes. Unless it is an empty flush, - * which is better mapped to a DRBD P_BARRIER packet, - * also for drbd wire protocol compatibility reasons. - * If this was a flush, just start a new epoch. - * Unless the current epoch was empty anyways, or we are not currently - * replicating, in which case there is no point. */ - if (unlikely(req->i.size == 0)) { - /* The only size==0 bios we expect are empty flushes. */ - D_ASSERT(req->master_bio->bi_rw & REQ_FLUSH); - if (remote) - start_new_tl_epoch(mdev->tconn); - return 0; - } - - if (!remote && !send_oos) - return 0; - - D_ASSERT(!(remote && send_oos)); - - if (remote) { - _req_mod(req, TO_BE_SENT); - _req_mod(req, QUEUE_FOR_NET_WRITE); - } else if (drbd_set_out_of_sync(mdev, req->i.sector, req->i.size)) - _req_mod(req, QUEUE_FOR_SEND_OOS); - - return remote; -} - -static void -drbd_submit_req_private_bio(struct drbd_request *req) -{ - struct drbd_conf *mdev = req->w.mdev; - struct bio *bio = req->private_bio; - const int rw = bio_rw(bio); - - bio->bi_bdev = mdev->ldev->backing_bdev; - - /* State may have changed since we grabbed our reference on the - * ->ldev member. Double check, and short-circuit to endio. - * In case the last activity log transaction failed to get on - * stable storage, and this is a WRITE, we may not even submit - * this bio. */ - if (get_ldev(mdev)) { - if (drbd_insert_fault(mdev, - rw == WRITE ? DRBD_FAULT_DT_WR - : rw == READ ? DRBD_FAULT_DT_RD - : DRBD_FAULT_DT_RA)) - bio_endio(bio, -EIO); - else - generic_make_request(bio); - put_ldev(mdev); - } else - bio_endio(bio, -EIO); -} - -void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time) +static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time) { const int rw = bio_rw(bio); - struct bio_and_error m = { NULL, }; + const int size = bio->bi_size; + const sector_t sector = bio->bi_sector; + struct drbd_tl_epoch *b = NULL; struct drbd_request *req; - bool no_remote = false; + int local, remote, send_oos = 0; + int err = -EIO; + int ret = 0; + union drbd_state s; /* allocate outside of all locks; */ req = drbd_req_new(mdev, bio); @@ -1035,14 +830,55 @@ void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long * if user cannot handle io errors, that's not our business. */ dev_err(DEV, "could not kmalloc() req\n"); bio_endio(bio, -ENOMEM); - return; + return 0; } req->start_time = start_time; - if (!get_ldev(mdev)) { - bio_put(req->private_bio); + local = get_ldev(mdev); + if (!local) { + bio_put(req->private_bio); /* or we get a bio leak */ req->private_bio = NULL; } + if (rw == WRITE) { + /* Need to replicate writes. Unless it is an empty flush, + * which is better mapped to a DRBD P_BARRIER packet, + * also for drbd wire protocol compatibility reasons. */ + if (unlikely(size == 0)) { + /* The only size==0 bios we expect are empty flushes. 
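The drbd_process_write_request() removed above decides per write whether to mirror the data to the peer, merely mark the range out of sync, or, for an empty flush, only open a new epoch. A compact sketch of that decision table; the enum and function names below are illustrative, not the driver's:

#include <stdio.h>
#include <stdbool.h>

/* What happens to one write request (illustrative names). */
enum write_fate { WF_LOCAL_ONLY, WF_MIRROR, WF_SEND_OOS, WF_BARRIER_ONLY };

/* remote:      peer is reachable and should get the data
 * send_oos:    we only tell the peer "this range is now out of sync"
 * empty_flush: size == 0 bio carrying just a flush                      */
static enum write_fate classify_write(bool remote, bool send_oos, bool empty_flush)
{
    if (empty_flush)
        /* mapped to a barrier packet; only worth it while replicating */
        return remote ? WF_BARRIER_ONLY : WF_LOCAL_ONLY;
    if (remote)
        return WF_MIRROR;      /* queue_for_net_write path */
    if (send_oos)
        return WF_SEND_OOS;    /* queue_for_send_oos path */
    return WF_LOCAL_ONLY;
}

int main(void)
{
    printf("%d %d %d %d\n",
           classify_write(true,  false, false),   /* mirror           */
           classify_write(false, true,  false),   /* send out-of-sync */
           classify_write(true,  false, true),    /* barrier only     */
           classify_write(false, false, false));  /* local only       */
    return 0;
}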
*/ + D_ASSERT(bio->bi_rw & REQ_FLUSH); + remote = 0; + } else + remote = 1; + } else { + /* READ || READA */ + if (local) { + if (!drbd_may_do_local_read(mdev, sector, size)) { + /* we could kick the syncer to + * sync this extent asap, wait for + * it, then continue locally. + * Or just issue the request remotely. + */ + local = 0; + bio_put(req->private_bio); + req->private_bio = NULL; + put_ldev(mdev); + } + } + remote = !local && mdev->state.pdsk >= D_UP_TO_DATE; + } + + /* If we have a disk, but a READA request is mapped to remote, + * we are R_PRIMARY, D_INCONSISTENT, SyncTarget. + * Just fail that READA request right here. + * + * THINK: maybe fail all READA when not local? + * or make this configurable... + * if network is slow, READA won't do any good. + */ + if (rw == READA && mdev->state.disk >= D_INCONSISTENT && !local) { + err = -EWOULDBLOCK; + goto fail_and_free_req; + } /* For WRITES going to the local disk, grab a reference on the target * extent. This waits for any resync activity in the corresponding @@ -1051,131 +887,348 @@ void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long * of transactional on-disk meta data updates. * Empty flushes don't need to go into the activity log, they can only * flush data for pending writes which are already in there. */ - if (rw == WRITE && req->private_bio && req->i.size + if (rw == WRITE && local && size && !test_bit(AL_SUSPENDED, &mdev->flags)) { req->rq_state |= RQ_IN_ACT_LOG; - drbd_al_begin_io(mdev, &req->i); + drbd_al_begin_io(mdev, sector); } - spin_lock_irq(&mdev->tconn->req_lock); - if (rw == WRITE) { - /* This may temporarily give up the req_lock, - * but will re-aquire it before it returns here. - * Needs to be before the check on drbd_suspended() */ - complete_conflicting_writes(req); + s = mdev->state; + remote = remote && drbd_should_do_remote(s); + send_oos = rw == WRITE && drbd_should_send_oos(s); + D_ASSERT(!(remote && send_oos)); + + if (!(local || remote) && !is_susp(mdev->state)) { + if (__ratelimit(&drbd_ratelimit_state)) + dev_err(DEV, "IO ERROR: neither local nor remote disk\n"); + goto fail_free_complete; } - /* no more giving up req_lock from now on! */ + /* For WRITE request, we have to make sure that we have an + * unused_spare_tle, in case we need to start a new epoch. + * I try to be smart and avoid to pre-allocate always "just in case", + * but there is a race between testing the bit and pointer outside the + * spinlock, and grabbing the spinlock. + * if we lost that race, we retry. */ + if (rw == WRITE && (remote || send_oos) && + mdev->unused_spare_tle == NULL && + test_bit(CREATE_BARRIER, &mdev->flags)) { +allocate_barrier: + b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_NOIO); + if (!b) { + dev_err(DEV, "Failed to alloc barrier.\n"); + err = -ENOMEM; + goto fail_free_complete; + } + } - if (drbd_suspended(mdev)) { - /* push back and retry: */ - req->rq_state |= RQ_POSTPONED; - if (req->private_bio) { - bio_put(req->private_bio); - req->private_bio = NULL; - put_ldev(mdev); + /* GOOD, everything prepared, grab the spin_lock */ + spin_lock_irq(&mdev->req_lock); + + if (is_susp(mdev->state)) { + /* If we got suspended, use the retry mechanism of + drbd_make_request() to restart processing of this + bio. 
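For reads, the restored code above picks one of three outcomes: serve locally, forward to the peer, or fail a READA with -EWOULDBLOCK when it holds a (possibly inconsistent) local disk but cannot serve the range from it. A sketch of that routing, with the disk states reduced to the predicates actually tested above (the names are mine):

#include <stdio.h>
#include <stdbool.h>

enum read_route { RD_LOCAL, RD_REMOTE, RD_FAIL };

/* local_ok:      drbd_may_do_local_read() said the range is usable locally
 * have_disk:     we have a local disk at all (disk >= D_INCONSISTENT)
 * peer_uptodate: pdsk >= D_UP_TO_DATE, so the peer could serve the read
 * is_readahead:  READA rather than READ                                   */
static enum read_route route_read(bool local_ok, bool have_disk,
                                  bool peer_uptodate, bool is_readahead)
{
    if (local_ok)
        return RD_LOCAL;
    if (is_readahead && have_disk)
        return RD_FAIL;            /* fail READA (would be -EWOULDBLOCK) */
    return peer_uptodate ? RD_REMOTE : RD_FAIL;
}

int main(void)
{
    printf("%d\n", route_read(false, true,  true,  true));   /* READA -> fail   */
    printf("%d\n", route_read(false, false, true,  false));  /* READ  -> remote */
    printf("%d\n", route_read(true,  true,  false, false));  /* READ  -> local  */
    return 0;
}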
In the next call to drbd_make_request + we sleep in inc_ap_bio() */ + ret = 1; + spin_unlock_irq(&mdev->req_lock); + goto fail_free_complete; + } + + if (remote || send_oos) { + remote = drbd_should_do_remote(mdev->state); + send_oos = rw == WRITE && drbd_should_send_oos(mdev->state); + D_ASSERT(!(remote && send_oos)); + + if (!(remote || send_oos)) + dev_warn(DEV, "lost connection while grabbing the req_lock!\n"); + if (!(local || remote)) { + dev_err(DEV, "IO ERROR: neither local nor remote disk\n"); + spin_unlock_irq(&mdev->req_lock); + goto fail_free_complete; } - goto out; } + if (b && mdev->unused_spare_tle == NULL) { + mdev->unused_spare_tle = b; + b = NULL; + } + if (rw == WRITE && (remote || send_oos) && + mdev->unused_spare_tle == NULL && + test_bit(CREATE_BARRIER, &mdev->flags)) { + /* someone closed the current epoch + * while we were grabbing the spinlock */ + spin_unlock_irq(&mdev->req_lock); + goto allocate_barrier; + } + + /* Update disk stats */ _drbd_start_io_acct(mdev, req, bio); - /* We fail READ/READA early, if we can not serve it. - * We must do this before req is registered on any lists. - * Otherwise, drbd_req_complete() will queue failed READ for retry. */ - if (rw != WRITE) { - if (!do_remote_read(req) && !req->private_bio) - goto nodata; + /* _maybe_start_new_epoch(mdev); + * If we need to generate a write barrier packet, we have to add the + * new epoch (barrier) object, and queue the barrier packet for sending, + * and queue the req's data after it _within the same lock_, otherwise + * we have race conditions were the reorder domains could be mixed up. + * + * Even read requests may start a new epoch and queue the corresponding + * barrier packet. To get the write ordering right, we only have to + * make sure that, if this is a write request and it triggered a + * barrier packet, this request is queued within the same spinlock. */ + if ((remote || send_oos) && mdev->unused_spare_tle && + test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) { + _tl_add_barrier(mdev, mdev->unused_spare_tle); + mdev->unused_spare_tle = NULL; + } else { + D_ASSERT(!(remote && rw == WRITE && + test_bit(CREATE_BARRIER, &mdev->flags))); } - /* which transfer log epoch does this belong to? */ - req->epoch = atomic_read(&mdev->tconn->current_tle_nr); + /* NOTE + * Actually, 'local' may be wrong here already, since we may have failed + * to write to the meta data, and may become wrong anytime because of + * local io-error for some other request, which would lead to us + * "detaching" the local disk. + * + * 'remote' may become wrong any time because the network could fail. + * + * This is a harmless race condition, though, since it is handled + * correctly at the appropriate places; so it just defers the failure + * of the respective operation. + */ + + /* mark them early for readability. + * this just sets some state flags. */ + if (remote) + _req_mod(req, to_be_send); + if (local) + _req_mod(req, to_be_submitted); + + /* check this request on the collision detection hash tables. + * if we have a conflict, just complete it here. + * THINK do we want to check reads, too? (I don't think so...) */ + if (rw == WRITE && _req_conflicts(req)) + goto fail_conflicting; /* no point in adding empty flushes to the transfer log, * they are mapped to drbd barriers already. 
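The unused_spare_tle handling above is a classic pre-allocate-outside-the-lock pattern: test cheaply without the req_lock, allocate if it looks necessary, re-test under the lock, and retry if another context consumed the spare in the meantime. A generic userspace sketch of that pattern, with a pthread mutex standing in for the spinlock and invented names throughout:

#include <stdlib.h>
#include <stdbool.h>
#include <pthread.h>

struct epoch { int n_writes; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct epoch *unused_spare;      /* stands in for mdev->unused_spare_tle */
static bool need_new_epoch = true;      /* stands in for the CREATE_BARRIER bit */

/* Returns 0 on success, -1 if allocation failed. */
static int ensure_spare_epoch(void)
{
    struct epoch *b = NULL;

    for (;;) {
        /* cheap test outside the lock: allocate only if it looks needed */
        if (need_new_epoch && unused_spare == NULL && b == NULL) {
            b = malloc(sizeof(*b));
            if (!b)
                return -1;
        }

        pthread_mutex_lock(&lock);
        if (b && unused_spare == NULL) {
            unused_spare = b;             /* hand over our allocation */
            b = NULL;
        }
        if (need_new_epoch && unused_spare == NULL) {
            /* someone consumed the spare while we were unlocked: retry */
            pthread_mutex_unlock(&lock);
            continue;
        }
        /* ... the real code would now queue the request under the lock ... */
        pthread_mutex_unlock(&lock);
        free(b);                          /* drop our allocation if not needed */
        return 0;
    }
}

int main(void) { return ensure_spare_epoch(); }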
*/ - if (likely(req->i.size!=0)) { - if (rw == WRITE) - mdev->tconn->current_tle_writes++; + if (likely(size!=0)) + list_add_tail(&req->tl_requests, &mdev->newest_tle->requests); - list_add_tail(&req->tl_requests, &mdev->tconn->transfer_log); + /* NOTE remote first: to get the concurrent write detection right, + * we must register the request before start of local IO. */ + if (remote) { + /* either WRITE and C_CONNECTED, + * or READ, and no local disk, + * or READ, but not in sync. + */ + _req_mod(req, (rw == WRITE) + ? queue_for_net_write + : queue_for_net_read); } + if (send_oos && drbd_set_out_of_sync(mdev, sector, size)) + _req_mod(req, queue_for_send_oos); - if (rw == WRITE) { - if (!drbd_process_write_request(req)) - no_remote = true; - } else { - /* We either have a private_bio, or we can read from remote. - * Otherwise we had done the goto nodata above. */ - if (req->private_bio == NULL) { - _req_mod(req, TO_BE_SENT); - _req_mod(req, QUEUE_FOR_NET_READ); + if (remote && + mdev->net_conf->on_congestion != OC_BLOCK && mdev->agreed_pro_version >= 96) + maybe_pull_ahead(mdev); + + /* If this was a flush, queue a drbd barrier/start a new epoch. + * Unless the current epoch was empty anyways, or we are not currently + * replicating, in which case there is no point. */ + if (unlikely(bio->bi_rw & REQ_FLUSH) + && mdev->newest_tle->n_writes + && drbd_should_do_remote(mdev->state)) + queue_barrier(mdev); + + spin_unlock_irq(&mdev->req_lock); + kfree(b); /* if someone else has beaten us to it... */ + + if (local) { + req->private_bio->bi_bdev = mdev->ldev->backing_bdev; + + /* State may have changed since we grabbed our reference on the + * mdev->ldev member. Double check, and short-circuit to endio. + * In case the last activity log transaction failed to get on + * stable storage, and this is a WRITE, we may not even submit + * this bio. */ + if (get_ldev(mdev)) { + if (drbd_insert_fault(mdev, rw == WRITE ? DRBD_FAULT_DT_WR + : rw == READ ? DRBD_FAULT_DT_RD + : DRBD_FAULT_DT_RA)) + bio_endio(req->private_bio, -EIO); + else + generic_make_request(req->private_bio); + put_ldev(mdev); } else - no_remote = true; + bio_endio(req->private_bio, -EIO); } - if (req->private_bio) { - /* needs to be marked within the same spinlock */ - _req_mod(req, TO_BE_SUBMITTED); - /* but we need to give up the spinlock to submit */ - spin_unlock_irq(&mdev->tconn->req_lock); - drbd_submit_req_private_bio(req); - spin_lock_irq(&mdev->tconn->req_lock); - } else if (no_remote) { -nodata: - if (__ratelimit(&drbd_ratelimit_state)) - dev_err(DEV, "IO ERROR: neither local nor remote data, sector %llu+%u\n", - (unsigned long long)req->i.sector, req->i.size >> 9); - /* A write may have been queued for send_oos, however. - * So we can not simply free it, we must go through drbd_req_put_completion_ref() */ + return 0; + +fail_conflicting: + /* this is a conflicting request. + * even though it may have been only _partially_ + * overlapping with one of the currently pending requests, + * without even submitting or sending it, we will + * pretend that it was successfully served right now. + */ + _drbd_end_io_acct(mdev, req); + spin_unlock_irq(&mdev->req_lock); + if (remote) + dec_ap_pending(mdev); + /* THINK: do we want to fail it (-EIO), or pretend success? + * this pretends success. 
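Before handing the private bio to the backing device, the code above re-takes the local-disk reference and may inject a fault whose class (DRBD_FAULT_DT_WR/RD/RA) is chosen by the request direction. A sketch of that submit-or-fail decision; fault_hit() is an invented stand-in for whatever drbd_insert_fault() decides, and the enum values are placeholders:

#include <stdio.h>
#include <stdbool.h>
#include <errno.h>

/* Stand-ins for the driver's fault-injection classes. */
enum fault_class { FAULT_DT_WR, FAULT_DT_RD, FAULT_DT_RA };
enum rw_hint     { RW_WRITE, RW_READ, RW_READAHEAD };

/* Invented predicate: would fault injection trigger for this class? */
static bool fault_hit(enum fault_class c) { (void)c; return false; }

/* Returns 0 if the bio would be submitted to the backing device,
 * -EIO if it would be completed with an error instead. */
static int submit_local(bool still_have_disk, enum rw_hint rw)
{
    enum fault_class c =
        rw == RW_WRITE ? FAULT_DT_WR :
        rw == RW_READ  ? FAULT_DT_RD : FAULT_DT_RA;

    if (!still_have_disk)      /* lost the disk since taking the reference */
        return -EIO;
    if (fault_hit(c))          /* simulated I/O error */
        return -EIO;
    return 0;                  /* generic_make_request() path */
}

int main(void)
{
    printf("%d %d\n", submit_local(true, RW_WRITE), submit_local(false, RW_READ));
    return 0;
}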
*/ + err = 0; + +fail_free_complete: + if (req->rq_state & RQ_IN_ACT_LOG) + drbd_al_complete_io(mdev, sector); +fail_and_free_req: + if (local) { + bio_put(req->private_bio); + req->private_bio = NULL; + put_ldev(mdev); } + if (!ret) + bio_endio(bio, err); + + drbd_req_free(req); + dec_ap_bio(mdev); + kfree(b); + + return ret; +} -out: - if (drbd_req_put_completion_ref(req, &m, 1)) - kref_put(&req->kref, drbd_req_destroy); - spin_unlock_irq(&mdev->tconn->req_lock); +/* helper function for drbd_make_request + * if we can determine just by the mdev (state) that this request will fail, + * return 1 + * otherwise return 0 + */ +static int drbd_fail_request_early(struct drbd_conf *mdev, int is_write) +{ + if (mdev->state.role != R_PRIMARY && + (!allow_oos || is_write)) { + if (__ratelimit(&drbd_ratelimit_state)) { + dev_err(DEV, "Process %s[%u] tried to %s; " + "since we are not in Primary state, " + "we cannot allow this\n", + current->comm, current->pid, + is_write ? "WRITE" : "READ"); + } + return 1; + } - if (m.bio) - complete_master_bio(mdev, &m); - return; + return 0; } void drbd_make_request(struct request_queue *q, struct bio *bio) { + unsigned int s_enr, e_enr; struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata; unsigned long start_time; + if (drbd_fail_request_early(mdev, bio_data_dir(bio) & WRITE)) { + bio_endio(bio, -EPERM); + return; + } + start_time = jiffies; /* * what we "blindly" assume: */ - D_ASSERT(IS_ALIGNED(bio->bi_size, 512)); + D_ASSERT((bio->bi_size & 0x1ff) == 0); + + /* to make some things easier, force alignment of requests within the + * granularity of our hash tables */ + s_enr = bio->bi_sector >> HT_SHIFT; + e_enr = bio->bi_size ? (bio->bi_sector+(bio->bi_size>>9)-1) >> HT_SHIFT : s_enr; + + if (likely(s_enr == e_enr)) { + do { + inc_ap_bio(mdev, 1); + } while (drbd_make_request_common(mdev, bio, start_time)); + return; + } + + /* can this bio be split generically? + * Maybe add our own split-arbitrary-bios function. */ + if (bio->bi_vcnt != 1 || bio->bi_idx != 0 || bio->bi_size > DRBD_MAX_BIO_SIZE) { + /* rather error out here than BUG in bio_split */ + dev_err(DEV, "bio would need to, but cannot, be split: " + "(vcnt=%u,idx=%u,size=%u,sector=%llu)\n", + bio->bi_vcnt, bio->bi_idx, bio->bi_size, + (unsigned long long)bio->bi_sector); + bio_endio(bio, -EINVAL); + } else { + /* This bio crosses some boundary, so we have to split it. */ + struct bio_pair *bp; + /* works for the "do not cross hash slot boundaries" case + * e.g. sector 262269, size 4096 + * s_enr = 262269 >> 6 = 4097 + * e_enr = (262269+8-1) >> 6 = 4098 + * HT_SHIFT = 6 + * sps = 64, mask = 63 + * first_sectors = 64 - (262269 & 63) = 3 + */ + const sector_t sect = bio->bi_sector; + const int sps = 1 << HT_SHIFT; /* sectors per slot */ + const int mask = sps - 1; + const sector_t first_sectors = sps - (sect & mask); + bp = bio_split(bio, first_sectors); - inc_ap_bio(mdev); - __drbd_make_request(mdev, bio, start_time); + /* we need to get a "reference count" (ap_bio_cnt) + * to avoid races with the disconnect/reconnect/suspend code. + * In case we need to split the bio here, we need to get three references + * atomically, otherwise we might deadlock when trying to submit the + * second one! 
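The bio-split comment above works through one concrete case (sector 262269, 4096 bytes, HT_SHIFT = 6). A tiny program that reproduces exactly that arithmetic, so the numbers in the comment can be checked:

#include <stdio.h>
#include <stdint.h>

typedef uint64_t sector_t;

#define HT_SHIFT 6                       /* as in the worked example above */

int main(void)
{
    sector_t sect = 262269;              /* bi_sector from the example */
    unsigned int size = 4096;            /* bi_size in bytes */

    const int sps  = 1 << HT_SHIFT;      /* sectors per hash slot: 64 */
    const int mask = sps - 1;            /* 63 */

    unsigned int s_enr = (unsigned int)(sect >> HT_SHIFT);
    unsigned int e_enr = (unsigned int)((sect + (size >> 9) - 1) >> HT_SHIFT);
    sector_t first_sectors = sps - (sect & mask);

    /* prints: s_enr=4097 e_enr=4098 first_sectors=3 */
    printf("s_enr=%u e_enr=%u first_sectors=%llu\n",
           s_enr, e_enr, (unsigned long long)first_sectors);
    return 0;
}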
*/ + inc_ap_bio(mdev, 3); + + D_ASSERT(e_enr == s_enr + 1); + + while (drbd_make_request_common(mdev, &bp->bio1, start_time)) + inc_ap_bio(mdev, 1); + + while (drbd_make_request_common(mdev, &bp->bio2, start_time)) + inc_ap_bio(mdev, 1); + + dec_ap_bio(mdev); + + bio_pair_release(bp); + } } -/* This is called by bio_add_page(). - * - * q->max_hw_sectors and other global limits are already enforced there. +/* This is called by bio_add_page(). With this function we reduce + * the number of BIOs that span over multiple DRBD_MAX_BIO_SIZEs + * units (was AL_EXTENTs). * - * We need to call down to our lower level device, - * in case it has special restrictions. - * - * We also may need to enforce configured max-bio-bvecs limits. + * we do the calculation within the lower 32bit of the byte offsets, + * since we don't care for actual offset, but only check whether it + * would cross "activity log extent" boundaries. * * As long as the BIO is empty we have to allow at least one bvec, - * regardless of size and offset, so no need to ask lower levels. + * regardless of size and offset. so the resulting bio may still + * cross extent boundaries. those are dealt with (bio_split) in + * drbd_make_request. */ int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec) { struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata; + unsigned int bio_offset = + (unsigned int)bvm->bi_sector << 9; /* 32 bit */ unsigned int bio_size = bvm->bi_size; - int limit = DRBD_MAX_BIO_SIZE; - int backing_limit; - - if (bio_size && get_ldev(mdev)) { + int limit, backing_limit; + + limit = DRBD_MAX_BIO_SIZE + - ((bio_offset & (DRBD_MAX_BIO_SIZE-1)) + bio_size); + if (limit < 0) + limit = 0; + if (bio_size == 0) { + if (limit <= bvec->bv_len) + limit = bvec->bv_len; + } else if (limit && get_ldev(mdev)) { struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue; if (b->merge_bvec_fn) { @@ -1187,38 +1240,24 @@ int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct return limit; } -struct drbd_request *find_oldest_request(struct drbd_tconn *tconn) -{ - /* Walk the transfer log, - * and find the oldest not yet completed request */ - struct drbd_request *r; - list_for_each_entry(r, &tconn->transfer_log, tl_requests) { - if (atomic_read(&r->completion_ref)) - return r; - } - return NULL; -} - void request_timer_fn(unsigned long data) { struct drbd_conf *mdev = (struct drbd_conf *) data; - struct drbd_tconn *tconn = mdev->tconn; struct drbd_request *req; /* oldest request */ - struct net_conf *nc; + struct list_head *le; unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */ unsigned long now; - rcu_read_lock(); - nc = rcu_dereference(tconn->net_conf); - if (nc && mdev->state.conn >= C_WF_REPORT_PARAMS) - ent = nc->timeout * HZ/10 * nc->ko_count; - + if (get_net_conf(mdev)) { + if (mdev->state.conn >= C_WF_REPORT_PARAMS) + ent = mdev->net_conf->timeout*HZ/10 + * mdev->net_conf->ko_count; + put_net_conf(mdev); + } if (get_ldev(mdev)) { /* implicit state.disk >= D_INCONSISTENT */ - dt = rcu_dereference(mdev->ldev->disk_conf)->disk_timeout * HZ / 10; + dt = mdev->ldev->dc.disk_timeout * HZ / 10; put_ldev(mdev); } - rcu_read_unlock(); - et = min_not_zero(dt, ent); if (!et) @@ -1226,14 +1265,17 @@ void request_timer_fn(unsigned long data) now = jiffies; - spin_lock_irq(&tconn->req_lock); - req = find_oldest_request(tconn); - if (!req) { - spin_unlock_irq(&tconn->req_lock); + spin_lock_irq(&mdev->req_lock); + le = 
&mdev->oldest_tle->requests; + if (list_empty(le)) { + spin_unlock_irq(&mdev->req_lock); mod_timer(&mdev->request_timer, now + et); return; } + le = le->prev; + req = list_entry(le, struct drbd_request, tl_requests); + /* The request is considered timed out, if * - we have some effective timeout from the configuration, * with above state restrictions applied, @@ -1252,17 +1294,17 @@ void request_timer_fn(unsigned long data) */ if (ent && req->rq_state & RQ_NET_PENDING && time_after(now, req->start_time + ent) && - !time_in_range(now, tconn->last_reconnect_jif, tconn->last_reconnect_jif + ent)) { + !time_in_range(now, mdev->last_reconnect_jif, mdev->last_reconnect_jif + ent)) { dev_warn(DEV, "Remote failed to finish a request within ko-count * timeout\n"); _drbd_set_state(_NS(mdev, conn, C_TIMEOUT), CS_VERBOSE | CS_HARD, NULL); } - if (dt && req->rq_state & RQ_LOCAL_PENDING && req->w.mdev == mdev && + if (dt && req->rq_state & RQ_LOCAL_PENDING && time_after(now, req->start_time + dt) && !time_in_range(now, mdev->last_reattach_jif, mdev->last_reattach_jif + dt)) { dev_warn(DEV, "Local backing device failed to meet the disk-timeout\n"); __drbd_chk_io_error(mdev, DRBD_FORCE_DETACH); } nt = (time_after(now, req->start_time + et) ? now : req->start_time) + et; - spin_unlock_irq(&tconn->req_lock); + spin_unlock_irq(&mdev->req_lock); mod_timer(&mdev->request_timer, nt); } diff --git a/trunk/drivers/block/drbd/drbd_req.h b/trunk/drivers/block/drbd/drbd_req.h index 016de6b8bb57..3d2111919486 100644 --- a/trunk/drivers/block/drbd/drbd_req.h +++ b/trunk/drivers/block/drbd/drbd_req.h @@ -77,41 +77,40 @@ */ enum drbd_req_event { - CREATED, - TO_BE_SENT, - TO_BE_SUBMITTED, + created, + to_be_send, + to_be_submitted, /* XXX yes, now I am inconsistent... * these are not "events" but "actions" * oh, well... */ - QUEUE_FOR_NET_WRITE, - QUEUE_FOR_NET_READ, - QUEUE_FOR_SEND_OOS, - - SEND_CANCELED, - SEND_FAILED, - HANDED_OVER_TO_NETWORK, - OOS_HANDED_TO_NETWORK, - CONNECTION_LOST_WHILE_PENDING, - READ_RETRY_REMOTE_CANCELED, - RECV_ACKED_BY_PEER, - WRITE_ACKED_BY_PEER, - WRITE_ACKED_BY_PEER_AND_SIS, /* and set_in_sync */ - CONFLICT_RESOLVED, - POSTPONE_WRITE, - NEG_ACKED, - BARRIER_ACKED, /* in protocol A and B */ - DATA_RECEIVED, /* (remote read) */ - - READ_COMPLETED_WITH_ERROR, - READ_AHEAD_COMPLETED_WITH_ERROR, - WRITE_COMPLETED_WITH_ERROR, - ABORT_DISK_IO, - COMPLETED_OK, - RESEND, - FAIL_FROZEN_DISK_IO, - RESTART_FROZEN_DISK_IO, - NOTHING, + queue_for_net_write, + queue_for_net_read, + queue_for_send_oos, + + send_canceled, + send_failed, + handed_over_to_network, + oos_handed_to_network, + connection_lost_while_pending, + read_retry_remote_canceled, + recv_acked_by_peer, + write_acked_by_peer, + write_acked_by_peer_and_sis, /* and set_in_sync */ + conflict_discarded_by_peer, + neg_acked, + barrier_acked, /* in protocol A and B */ + data_received, /* (remote read) */ + + read_completed_with_error, + read_ahead_completed_with_error, + write_completed_with_error, + abort_disk_io, + completed_ok, + resend, + fail_frozen_disk_io, + restart_frozen_disk_io, + nothing, /* for tracing only */ }; /* encoding of request states for now. we don't actually need that many bits. @@ -143,8 +142,8 @@ enum drbd_req_state_bits { * recv_ack (B) or implicit "ack" (A), * still waiting for the barrier ack. * master_bio may already be completed and invalidated. 
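The request timer earlier in this hunk derives an effective network timeout as ko_count times the configured timeout, both apparently stored in tenths of a second (hence the HZ/10 scaling), and re-arms itself after the smaller non-zero of the network and disk timeouts. A sketch of that computation with invented configuration values:

#include <stdio.h>

#define HZ 250   /* illustrative jiffies per second */

/* min, but a zero operand means "not configured" and is ignored */
static unsigned long min_not_zero(unsigned long a, unsigned long b)
{
    if (!a) return b;
    if (!b) return a;
    return a < b ? a : b;
}

int main(void)
{
    /* illustrative settings, in tenths of a second as the HZ/10 scaling suggests */
    unsigned long timeout      = 60;   /* 6 s network timeout */
    unsigned long ko_count     = 7;    /* missed-timeout budget */
    unsigned long disk_timeout = 0;    /* 0: no disk timeout configured */

    unsigned long ent = timeout * HZ / 10 * ko_count;  /* effective net timeout, jiffies */
    unsigned long dt  = disk_timeout * HZ / 10;        /* disk timeout, jiffies */
    unsigned long et  = min_not_zero(dt, ent);         /* when to re-arm the timer */

    printf("ent=%lu dt=%lu et=%lu jiffies\n", ent, dt, et);
    return 0;
}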
- * 11100: write acked (C), - * data received (for remote read, any protocol) + * 11100: write_acked (C), + * data_received (for remote read, any protocol) * or finally the barrier ack has arrived (B,A)... * request can be freed * 01100: neg-acked (write, protocol C) @@ -199,22 +198,6 @@ enum drbd_req_state_bits { /* Should call drbd_al_complete_io() for this request... */ __RQ_IN_ACT_LOG, - - /* The peer has sent a retry ACK */ - __RQ_POSTPONED, - - /* would have been completed, - * but was not, because of drbd_suspended() */ - __RQ_COMPLETION_SUSP, - - /* We expect a receive ACK (wire proto B) */ - __RQ_EXP_RECEIVE_ACK, - - /* We expect a write ACK (wite proto C) */ - __RQ_EXP_WRITE_ACK, - - /* waiting for a barrier ack, did an extra kref_get */ - __RQ_EXP_BARR_ACK, }; #define RQ_LOCAL_PENDING (1UL << __RQ_LOCAL_PENDING) @@ -236,16 +219,56 @@ enum drbd_req_state_bits { #define RQ_WRITE (1UL << __RQ_WRITE) #define RQ_IN_ACT_LOG (1UL << __RQ_IN_ACT_LOG) -#define RQ_POSTPONED (1UL << __RQ_POSTPONED) -#define RQ_COMPLETION_SUSP (1UL << __RQ_COMPLETION_SUSP) -#define RQ_EXP_RECEIVE_ACK (1UL << __RQ_EXP_RECEIVE_ACK) -#define RQ_EXP_WRITE_ACK (1UL << __RQ_EXP_WRITE_ACK) -#define RQ_EXP_BARR_ACK (1UL << __RQ_EXP_BARR_ACK) /* For waking up the frozen transfer log mod_req() has to return if the request should be counted in the epoch object*/ -#define MR_WRITE 1 -#define MR_READ 2 +#define MR_WRITE_SHIFT 0 +#define MR_WRITE (1 << MR_WRITE_SHIFT) +#define MR_READ_SHIFT 1 +#define MR_READ (1 << MR_READ_SHIFT) + +/* epoch entries */ +static inline +struct hlist_head *ee_hash_slot(struct drbd_conf *mdev, sector_t sector) +{ + BUG_ON(mdev->ee_hash_s == 0); + return mdev->ee_hash + + ((unsigned int)(sector>>HT_SHIFT) % mdev->ee_hash_s); +} + +/* transfer log (drbd_request objects) */ +static inline +struct hlist_head *tl_hash_slot(struct drbd_conf *mdev, sector_t sector) +{ + BUG_ON(mdev->tl_hash_s == 0); + return mdev->tl_hash + + ((unsigned int)(sector>>HT_SHIFT) % mdev->tl_hash_s); +} + +/* application reads (drbd_request objects) */ +static struct hlist_head *ar_hash_slot(struct drbd_conf *mdev, sector_t sector) +{ + return mdev->app_reads_hash + + ((unsigned int)(sector) % APP_R_HSIZE); +} + +/* when we receive the answer for a read request, + * verify that we actually know about it */ +static inline struct drbd_request *_ar_id_to_req(struct drbd_conf *mdev, + u64 id, sector_t sector) +{ + struct hlist_head *slot = ar_hash_slot(mdev, sector); + struct hlist_node *n; + struct drbd_request *req; + + hlist_for_each_entry(req, n, slot, collision) { + if ((unsigned long)req == (unsigned long)id) { + D_ASSERT(req->sector == sector); + return req; + } + } + return NULL; +} static inline void drbd_req_make_private_bio(struct drbd_request *req, struct bio *bio_src) { @@ -255,10 +278,41 @@ static inline void drbd_req_make_private_bio(struct drbd_request *req, struct bi req->private_bio = bio; bio->bi_private = req; - bio->bi_end_io = drbd_request_endio; + bio->bi_end_io = drbd_endio_pri; bio->bi_next = NULL; } +static inline struct drbd_request *drbd_req_new(struct drbd_conf *mdev, + struct bio *bio_src) +{ + struct drbd_request *req = + mempool_alloc(drbd_request_mempool, GFP_NOIO); + if (likely(req)) { + drbd_req_make_private_bio(req, bio_src); + + req->rq_state = bio_data_dir(bio_src) == WRITE ? 
RQ_WRITE : 0; + req->mdev = mdev; + req->master_bio = bio_src; + req->epoch = 0; + req->sector = bio_src->bi_sector; + req->size = bio_src->bi_size; + INIT_HLIST_NODE(&req->collision); + INIT_LIST_HEAD(&req->tl_requests); + INIT_LIST_HEAD(&req->w.list); + } + return req; +} + +static inline void drbd_req_free(struct drbd_request *req) +{ + mempool_free(req, drbd_request_mempool); +} + +static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2) +{ + return !((s1 + (l1>>9) <= s2) || (s1 >= s2 + (l2>>9))); +} + /* Short lived temporary struct on the stack. * We could squirrel the error to be returned into * bio->bi_size, or similar. But that would be too ugly. */ @@ -267,7 +321,6 @@ struct bio_and_error { int error; }; -extern void drbd_req_destroy(struct kref *kref); extern void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m); extern int __req_mod(struct drbd_request *req, enum drbd_req_event what, @@ -275,17 +328,13 @@ extern int __req_mod(struct drbd_request *req, enum drbd_req_event what, extern void complete_master_bio(struct drbd_conf *mdev, struct bio_and_error *m); extern void request_timer_fn(unsigned long data); -extern void tl_restart(struct drbd_tconn *tconn, enum drbd_req_event what); -extern void _tl_restart(struct drbd_tconn *tconn, enum drbd_req_event what); - -/* this is in drbd_main.c */ -extern void drbd_restart_request(struct drbd_request *req); +extern void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what); /* use this if you don't want to deal with calling complete_master_bio() * outside the spinlock, e.g. when walking some list on cleanup. */ static inline int _req_mod(struct drbd_request *req, enum drbd_req_event what) { - struct drbd_conf *mdev = req->w.mdev; + struct drbd_conf *mdev = req->mdev; struct bio_and_error m; int rv; @@ -305,13 +354,13 @@ static inline int req_mod(struct drbd_request *req, enum drbd_req_event what) { unsigned long flags; - struct drbd_conf *mdev = req->w.mdev; + struct drbd_conf *mdev = req->mdev; struct bio_and_error m; int rv; - spin_lock_irqsave(&mdev->tconn->req_lock, flags); + spin_lock_irqsave(&mdev->req_lock, flags); rv = __req_mod(req, what, &m); - spin_unlock_irqrestore(&mdev->tconn->req_lock, flags); + spin_unlock_irqrestore(&mdev->req_lock, flags); if (m.bio) complete_master_bio(mdev, &m); @@ -319,7 +368,7 @@ static inline int req_mod(struct drbd_request *req, return rv; } -static inline bool drbd_should_do_remote(union drbd_dev_state s) +static inline bool drbd_should_do_remote(union drbd_state s) { return s.pdsk == D_UP_TO_DATE || (s.pdsk >= D_INCONSISTENT && @@ -329,7 +378,7 @@ static inline bool drbd_should_do_remote(union drbd_dev_state s) That is equivalent since before 96 IO was frozen in the C_WF_BITMAP* states. */ } -static inline bool drbd_should_send_out_of_sync(union drbd_dev_state s) +static inline bool drbd_should_send_oos(union drbd_state s) { return s.conn == C_AHEAD || s.conn == C_WF_BITMAP_S; /* pdsk = D_INCONSISTENT as a consequence. Protocol 96 check not necessary diff --git a/trunk/drivers/block/drbd/drbd_state.c b/trunk/drivers/block/drbd/drbd_state.c deleted file mode 100644 index 53bf6182bac4..000000000000 --- a/trunk/drivers/block/drbd/drbd_state.c +++ /dev/null @@ -1,1856 +0,0 @@ -/* - drbd_state.c - - This file is part of DRBD by Philipp Reisner and Lars Ellenberg. - - Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. - Copyright (C) 1999-2008, Philipp Reisner . - Copyright (C) 2002-2008, Lars Ellenberg . 
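The overlaps() helper above tests whether two requests, each given as a start sector and a length in bytes, touch a common sector range. A standalone copy of the test with two worked cases (the sector/length values are arbitrary examples):

#include <stdio.h>
#include <stdint.h>

typedef uint64_t sector_t;

/* Same test as the overlaps() helper above: the ranges overlap unless one
 * ends at or before the start of the other (lengths converted to sectors). */
static int overlaps(sector_t s1, int l1, sector_t s2, int l2)
{
    return !((s1 + (l1 >> 9) <= s2) || (s1 >= s2 + (l2 >> 9)));
}

int main(void)
{
    /* 4 KiB at sector 0 vs 4 KiB at sector 8: adjacent, no overlap */
    printf("%d\n", overlaps(0, 4096, 8, 4096));   /* 0 */
    /* 4 KiB at sector 0 vs 4 KiB at sector 4: second half collides */
    printf("%d\n", overlaps(0, 4096, 4, 4096));   /* 1 */
    return 0;
}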
- - Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev - from Logicworks, Inc. for making SDP replication support possible. - - drbd is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2, or (at your option) - any later version. - - drbd is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with drbd; see the file COPYING. If not, write to - the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#include -#include "drbd_int.h" -#include "drbd_req.h" - -/* in drbd_main.c */ -extern void tl_abort_disk_io(struct drbd_conf *mdev); - -struct after_state_chg_work { - struct drbd_work w; - union drbd_state os; - union drbd_state ns; - enum chg_state_flags flags; - struct completion *done; -}; - -enum sanitize_state_warnings { - NO_WARNING, - ABORTED_ONLINE_VERIFY, - ABORTED_RESYNC, - CONNECTION_LOST_NEGOTIATING, - IMPLICITLY_UPGRADED_DISK, - IMPLICITLY_UPGRADED_PDSK, -}; - -static int w_after_state_ch(struct drbd_work *w, int unused); -static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, - union drbd_state ns, enum chg_state_flags flags); -static enum drbd_state_rv is_valid_state(struct drbd_conf *, union drbd_state); -static enum drbd_state_rv is_valid_soft_transition(union drbd_state, union drbd_state, struct drbd_tconn *); -static enum drbd_state_rv is_valid_transition(union drbd_state os, union drbd_state ns); -static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state ns, - enum sanitize_state_warnings *warn); - -static inline bool is_susp(union drbd_state s) -{ - return s.susp || s.susp_nod || s.susp_fen; -} - -bool conn_all_vols_unconf(struct drbd_tconn *tconn) -{ - struct drbd_conf *mdev; - bool rv = true; - int vnr; - - rcu_read_lock(); - idr_for_each_entry(&tconn->volumes, mdev, vnr) { - if (mdev->state.disk != D_DISKLESS || - mdev->state.conn != C_STANDALONE || - mdev->state.role != R_SECONDARY) { - rv = false; - break; - } - } - rcu_read_unlock(); - - return rv; -} - -/* Unfortunately the states where not correctly ordered, when - they where defined. therefore can not use max_t() here. 
*/ -static enum drbd_role max_role(enum drbd_role role1, enum drbd_role role2) -{ - if (role1 == R_PRIMARY || role2 == R_PRIMARY) - return R_PRIMARY; - if (role1 == R_SECONDARY || role2 == R_SECONDARY) - return R_SECONDARY; - return R_UNKNOWN; -} -static enum drbd_role min_role(enum drbd_role role1, enum drbd_role role2) -{ - if (role1 == R_UNKNOWN || role2 == R_UNKNOWN) - return R_UNKNOWN; - if (role1 == R_SECONDARY || role2 == R_SECONDARY) - return R_SECONDARY; - return R_PRIMARY; -} - -enum drbd_role conn_highest_role(struct drbd_tconn *tconn) -{ - enum drbd_role role = R_UNKNOWN; - struct drbd_conf *mdev; - int vnr; - - rcu_read_lock(); - idr_for_each_entry(&tconn->volumes, mdev, vnr) - role = max_role(role, mdev->state.role); - rcu_read_unlock(); - - return role; -} - -enum drbd_role conn_highest_peer(struct drbd_tconn *tconn) -{ - enum drbd_role peer = R_UNKNOWN; - struct drbd_conf *mdev; - int vnr; - - rcu_read_lock(); - idr_for_each_entry(&tconn->volumes, mdev, vnr) - peer = max_role(peer, mdev->state.peer); - rcu_read_unlock(); - - return peer; -} - -enum drbd_disk_state conn_highest_disk(struct drbd_tconn *tconn) -{ - enum drbd_disk_state ds = D_DISKLESS; - struct drbd_conf *mdev; - int vnr; - - rcu_read_lock(); - idr_for_each_entry(&tconn->volumes, mdev, vnr) - ds = max_t(enum drbd_disk_state, ds, mdev->state.disk); - rcu_read_unlock(); - - return ds; -} - -enum drbd_disk_state conn_lowest_disk(struct drbd_tconn *tconn) -{ - enum drbd_disk_state ds = D_MASK; - struct drbd_conf *mdev; - int vnr; - - rcu_read_lock(); - idr_for_each_entry(&tconn->volumes, mdev, vnr) - ds = min_t(enum drbd_disk_state, ds, mdev->state.disk); - rcu_read_unlock(); - - return ds; -} - -enum drbd_disk_state conn_highest_pdsk(struct drbd_tconn *tconn) -{ - enum drbd_disk_state ds = D_DISKLESS; - struct drbd_conf *mdev; - int vnr; - - rcu_read_lock(); - idr_for_each_entry(&tconn->volumes, mdev, vnr) - ds = max_t(enum drbd_disk_state, ds, mdev->state.pdsk); - rcu_read_unlock(); - - return ds; -} - -enum drbd_conns conn_lowest_conn(struct drbd_tconn *tconn) -{ - enum drbd_conns conn = C_MASK; - struct drbd_conf *mdev; - int vnr; - - rcu_read_lock(); - idr_for_each_entry(&tconn->volumes, mdev, vnr) - conn = min_t(enum drbd_conns, conn, mdev->state.conn); - rcu_read_unlock(); - - return conn; -} - -static bool no_peer_wf_report_params(struct drbd_tconn *tconn) -{ - struct drbd_conf *mdev; - int vnr; - bool rv = true; - - rcu_read_lock(); - idr_for_each_entry(&tconn->volumes, mdev, vnr) - if (mdev->state.conn == C_WF_REPORT_PARAMS) { - rv = false; - break; - } - rcu_read_unlock(); - - return rv; -} - - -/** - * cl_wide_st_chg() - true if the state change is a cluster wide one - * @mdev: DRBD device. - * @os: old (current) state. - * @ns: new (wanted) state. 
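The max_role()/min_role() helpers earlier in this deleted file exist because the role enum is not ordered by "importance", so a plain numeric max_t() would rank R_SECONDARY above R_PRIMARY. A small sketch that shows why; the enum values below are assumed for illustration:

#include <stdio.h>

/* Illustrative copy of the role enum; the point is that R_PRIMARY does not
 * have the largest numeric value, so max_t() over the raw values is wrong. */
enum role { R_UNKNOWN = 0, R_PRIMARY = 1, R_SECONDARY = 2 };

static enum role max_role(enum role a, enum role b)
{
    if (a == R_PRIMARY || b == R_PRIMARY)
        return R_PRIMARY;
    if (a == R_SECONDARY || b == R_SECONDARY)
        return R_SECONDARY;
    return R_UNKNOWN;
}

int main(void)
{
    /* a numeric max over {2, 1, 0} would report R_SECONDARY (2) */
    enum role volumes[] = { R_SECONDARY, R_PRIMARY, R_UNKNOWN };
    enum role highest = R_UNKNOWN;

    for (unsigned i = 0; i < sizeof(volumes) / sizeof(volumes[0]); i++)
        highest = max_role(highest, volumes[i]);

    printf("highest role = %d (R_PRIMARY)\n", highest);
    return 0;
}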
- */ -static int cl_wide_st_chg(struct drbd_conf *mdev, - union drbd_state os, union drbd_state ns) -{ - return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED && - ((os.role != R_PRIMARY && ns.role == R_PRIMARY) || - (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) || - (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) || - (os.disk != D_FAILED && ns.disk == D_FAILED))) || - (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) || - (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S) || - (os.conn == C_CONNECTED && ns.conn == C_WF_REPORT_PARAMS); -} - -static union drbd_state -apply_mask_val(union drbd_state os, union drbd_state mask, union drbd_state val) -{ - union drbd_state ns; - ns.i = (os.i & ~mask.i) | val.i; - return ns; -} - -enum drbd_state_rv -drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f, - union drbd_state mask, union drbd_state val) -{ - unsigned long flags; - union drbd_state ns; - enum drbd_state_rv rv; - - spin_lock_irqsave(&mdev->tconn->req_lock, flags); - ns = apply_mask_val(drbd_read_state(mdev), mask, val); - rv = _drbd_set_state(mdev, ns, f, NULL); - spin_unlock_irqrestore(&mdev->tconn->req_lock, flags); - - return rv; -} - -/** - * drbd_force_state() - Impose a change which happens outside our control on our state - * @mdev: DRBD device. - * @mask: mask of state bits to change. - * @val: value of new state bits. - */ -void drbd_force_state(struct drbd_conf *mdev, - union drbd_state mask, union drbd_state val) -{ - drbd_change_state(mdev, CS_HARD, mask, val); -} - -static enum drbd_state_rv -_req_st_cond(struct drbd_conf *mdev, union drbd_state mask, - union drbd_state val) -{ - union drbd_state os, ns; - unsigned long flags; - enum drbd_state_rv rv; - - if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags)) - return SS_CW_SUCCESS; - - if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags)) - return SS_CW_FAILED_BY_PEER; - - spin_lock_irqsave(&mdev->tconn->req_lock, flags); - os = drbd_read_state(mdev); - ns = sanitize_state(mdev, apply_mask_val(os, mask, val), NULL); - rv = is_valid_transition(os, ns); - if (rv >= SS_SUCCESS) - rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */ - - if (!cl_wide_st_chg(mdev, os, ns)) - rv = SS_CW_NO_NEED; - if (rv == SS_UNKNOWN_ERROR) { - rv = is_valid_state(mdev, ns); - if (rv >= SS_SUCCESS) { - rv = is_valid_soft_transition(os, ns, mdev->tconn); - if (rv >= SS_SUCCESS) - rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */ - } - } - spin_unlock_irqrestore(&mdev->tconn->req_lock, flags); - - return rv; -} - -/** - * drbd_req_state() - Perform an eventually cluster wide state change - * @mdev: DRBD device. - * @mask: mask of state bits to change. - * @val: value of new state bits. - * @f: flags - * - * Should not be called directly, use drbd_request_state() or - * _drbd_request_state(). 
- */ -static enum drbd_state_rv -drbd_req_state(struct drbd_conf *mdev, union drbd_state mask, - union drbd_state val, enum chg_state_flags f) -{ - struct completion done; - unsigned long flags; - union drbd_state os, ns; - enum drbd_state_rv rv; - - init_completion(&done); - - if (f & CS_SERIALIZE) - mutex_lock(mdev->state_mutex); - - spin_lock_irqsave(&mdev->tconn->req_lock, flags); - os = drbd_read_state(mdev); - ns = sanitize_state(mdev, apply_mask_val(os, mask, val), NULL); - rv = is_valid_transition(os, ns); - if (rv < SS_SUCCESS) { - spin_unlock_irqrestore(&mdev->tconn->req_lock, flags); - goto abort; - } - - if (cl_wide_st_chg(mdev, os, ns)) { - rv = is_valid_state(mdev, ns); - if (rv == SS_SUCCESS) - rv = is_valid_soft_transition(os, ns, mdev->tconn); - spin_unlock_irqrestore(&mdev->tconn->req_lock, flags); - - if (rv < SS_SUCCESS) { - if (f & CS_VERBOSE) - print_st_err(mdev, os, ns, rv); - goto abort; - } - - if (drbd_send_state_req(mdev, mask, val)) { - rv = SS_CW_FAILED_BY_PEER; - if (f & CS_VERBOSE) - print_st_err(mdev, os, ns, rv); - goto abort; - } - - wait_event(mdev->state_wait, - (rv = _req_st_cond(mdev, mask, val))); - - if (rv < SS_SUCCESS) { - if (f & CS_VERBOSE) - print_st_err(mdev, os, ns, rv); - goto abort; - } - spin_lock_irqsave(&mdev->tconn->req_lock, flags); - ns = apply_mask_val(drbd_read_state(mdev), mask, val); - rv = _drbd_set_state(mdev, ns, f, &done); - } else { - rv = _drbd_set_state(mdev, ns, f, &done); - } - - spin_unlock_irqrestore(&mdev->tconn->req_lock, flags); - - if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) { - D_ASSERT(current != mdev->tconn->worker.task); - wait_for_completion(&done); - } - -abort: - if (f & CS_SERIALIZE) - mutex_unlock(mdev->state_mutex); - - return rv; -} - -/** - * _drbd_request_state() - Request a state change (with flags) - * @mdev: DRBD device. - * @mask: mask of state bits to change. - * @val: value of new state bits. - * @f: flags - * - * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE - * flag, or when logging of failed state change requests is not desired. - */ -enum drbd_state_rv -_drbd_request_state(struct drbd_conf *mdev, union drbd_state mask, - union drbd_state val, enum chg_state_flags f) -{ - enum drbd_state_rv rv; - - wait_event(mdev->state_wait, - (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE); - - return rv; -} - -static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns) -{ - dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c%c%c }\n", - name, - drbd_conn_str(ns.conn), - drbd_role_str(ns.role), - drbd_role_str(ns.peer), - drbd_disk_str(ns.disk), - drbd_disk_str(ns.pdsk), - is_susp(ns) ? 's' : 'r', - ns.aftr_isp ? 'a' : '-', - ns.peer_isp ? 'p' : '-', - ns.user_isp ? 'u' : '-', - ns.susp_fen ? 'F' : '-', - ns.susp_nod ? 
'N' : '-' - ); -} - -void print_st_err(struct drbd_conf *mdev, union drbd_state os, - union drbd_state ns, enum drbd_state_rv err) -{ - if (err == SS_IN_TRANSIENT_STATE) - return; - dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err)); - print_st(mdev, " state", os); - print_st(mdev, "wanted", ns); -} - -static long print_state_change(char *pb, union drbd_state os, union drbd_state ns, - enum chg_state_flags flags) -{ - char *pbp; - pbp = pb; - *pbp = 0; - - if (ns.role != os.role && flags & CS_DC_ROLE) - pbp += sprintf(pbp, "role( %s -> %s ) ", - drbd_role_str(os.role), - drbd_role_str(ns.role)); - if (ns.peer != os.peer && flags & CS_DC_PEER) - pbp += sprintf(pbp, "peer( %s -> %s ) ", - drbd_role_str(os.peer), - drbd_role_str(ns.peer)); - if (ns.conn != os.conn && flags & CS_DC_CONN) - pbp += sprintf(pbp, "conn( %s -> %s ) ", - drbd_conn_str(os.conn), - drbd_conn_str(ns.conn)); - if (ns.disk != os.disk && flags & CS_DC_DISK) - pbp += sprintf(pbp, "disk( %s -> %s ) ", - drbd_disk_str(os.disk), - drbd_disk_str(ns.disk)); - if (ns.pdsk != os.pdsk && flags & CS_DC_PDSK) - pbp += sprintf(pbp, "pdsk( %s -> %s ) ", - drbd_disk_str(os.pdsk), - drbd_disk_str(ns.pdsk)); - - return pbp - pb; -} - -static void drbd_pr_state_change(struct drbd_conf *mdev, union drbd_state os, union drbd_state ns, - enum chg_state_flags flags) -{ - char pb[300]; - char *pbp = pb; - - pbp += print_state_change(pbp, os, ns, flags ^ CS_DC_MASK); - - if (ns.aftr_isp != os.aftr_isp) - pbp += sprintf(pbp, "aftr_isp( %d -> %d ) ", - os.aftr_isp, - ns.aftr_isp); - if (ns.peer_isp != os.peer_isp) - pbp += sprintf(pbp, "peer_isp( %d -> %d ) ", - os.peer_isp, - ns.peer_isp); - if (ns.user_isp != os.user_isp) - pbp += sprintf(pbp, "user_isp( %d -> %d ) ", - os.user_isp, - ns.user_isp); - - if (pbp != pb) - dev_info(DEV, "%s\n", pb); -} - -static void conn_pr_state_change(struct drbd_tconn *tconn, union drbd_state os, union drbd_state ns, - enum chg_state_flags flags) -{ - char pb[300]; - char *pbp = pb; - - pbp += print_state_change(pbp, os, ns, flags); - - if (is_susp(ns) != is_susp(os) && flags & CS_DC_SUSP) - pbp += sprintf(pbp, "susp( %d -> %d ) ", - is_susp(os), - is_susp(ns)); - - if (pbp != pb) - conn_info(tconn, "%s\n", pb); -} - - -/** - * is_valid_state() - Returns an SS_ error code if ns is not valid - * @mdev: DRBD device. - * @ns: State to consider. 
- */ -static enum drbd_state_rv -is_valid_state(struct drbd_conf *mdev, union drbd_state ns) -{ - /* See drbd_state_sw_errors in drbd_strings.c */ - - enum drbd_fencing_p fp; - enum drbd_state_rv rv = SS_SUCCESS; - struct net_conf *nc; - - rcu_read_lock(); - fp = FP_DONT_CARE; - if (get_ldev(mdev)) { - fp = rcu_dereference(mdev->ldev->disk_conf)->fencing; - put_ldev(mdev); - } - - nc = rcu_dereference(mdev->tconn->net_conf); - if (nc) { - if (!nc->two_primaries && ns.role == R_PRIMARY) { - if (ns.peer == R_PRIMARY) - rv = SS_TWO_PRIMARIES; - else if (conn_highest_peer(mdev->tconn) == R_PRIMARY) - rv = SS_O_VOL_PEER_PRI; - } - } - - if (rv <= 0) - /* already found a reason to abort */; - else if (ns.role == R_SECONDARY && mdev->open_cnt) - rv = SS_DEVICE_IN_USE; - - else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE) - rv = SS_NO_UP_TO_DATE_DISK; - - else if (fp >= FP_RESOURCE && - ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN) - rv = SS_PRIMARY_NOP; - - else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT) - rv = SS_NO_UP_TO_DATE_DISK; - - else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT) - rv = SS_NO_LOCAL_DISK; - - else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT) - rv = SS_NO_REMOTE_DISK; - - else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) - rv = SS_NO_UP_TO_DATE_DISK; - - else if ((ns.conn == C_CONNECTED || - ns.conn == C_WF_BITMAP_S || - ns.conn == C_SYNC_SOURCE || - ns.conn == C_PAUSED_SYNC_S) && - ns.disk == D_OUTDATED) - rv = SS_CONNECTED_OUTDATES; - - else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && - (nc->verify_alg[0] == 0)) - rv = SS_NO_VERIFY_ALG; - - else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && - mdev->tconn->agreed_pro_version < 88) - rv = SS_NOT_SUPPORTED; - - else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN) - rv = SS_CONNECTED_OUTDATES; - - rcu_read_unlock(); - - return rv; -} - -/** - * is_valid_soft_transition() - Returns an SS_ error code if the state transition is not possible - * This function limits state transitions that may be declined by DRBD. I.e. - * user requests (aka soft transitions). - * @mdev: DRBD device. - * @ns: new state. - * @os: old state. - */ -static enum drbd_state_rv -is_valid_soft_transition(union drbd_state os, union drbd_state ns, struct drbd_tconn *tconn) -{ - enum drbd_state_rv rv = SS_SUCCESS; - - if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) && - os.conn > C_CONNECTED) - rv = SS_RESYNC_RUNNING; - - if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE) - rv = SS_ALREADY_STANDALONE; - - if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS) - rv = SS_IS_DISKLESS; - - if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED) - rv = SS_NO_NET_CONFIG; - - if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING) - rv = SS_LOWER_THAN_OUTDATED; - - if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED) - rv = SS_IN_TRANSIENT_STATE; - - /* if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS) - rv = SS_IN_TRANSIENT_STATE; */ - - /* While establishing a connection only allow cstate to change. - Delay/refuse role changes, detach attach etc... 
*/ - if (test_bit(STATE_SENT, &tconn->flags) && - !(os.conn == C_WF_REPORT_PARAMS || - (ns.conn == C_WF_REPORT_PARAMS && os.conn == C_WF_CONNECTION))) - rv = SS_IN_TRANSIENT_STATE; - - if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED) - rv = SS_NEED_CONNECTION; - - if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && - ns.conn != os.conn && os.conn > C_CONNECTED) - rv = SS_RESYNC_RUNNING; - - if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) && - os.conn < C_CONNECTED) - rv = SS_NEED_CONNECTION; - - if ((ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE) - && os.conn < C_WF_REPORT_PARAMS) - rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */ - - return rv; -} - -static enum drbd_state_rv -is_valid_conn_transition(enum drbd_conns oc, enum drbd_conns nc) -{ - /* no change -> nothing to do, at least for the connection part */ - if (oc == nc) - return SS_NOTHING_TO_DO; - - /* disconnect of an unconfigured connection does not make sense */ - if (oc == C_STANDALONE && nc == C_DISCONNECTING) - return SS_ALREADY_STANDALONE; - - /* from C_STANDALONE, we start with C_UNCONNECTED */ - if (oc == C_STANDALONE && nc != C_UNCONNECTED) - return SS_NEED_CONNECTION; - - /* When establishing a connection we need to go through WF_REPORT_PARAMS! - Necessary to do the right thing upon invalidate-remote on a disconnected resource */ - if (oc < C_WF_REPORT_PARAMS && nc >= C_CONNECTED) - return SS_NEED_CONNECTION; - - /* After a network error only C_UNCONNECTED or C_DISCONNECTING may follow. */ - if (oc >= C_TIMEOUT && oc <= C_TEAR_DOWN && nc != C_UNCONNECTED && nc != C_DISCONNECTING) - return SS_IN_TRANSIENT_STATE; - - /* After C_DISCONNECTING only C_STANDALONE may follow */ - if (oc == C_DISCONNECTING && nc != C_STANDALONE) - return SS_IN_TRANSIENT_STATE; - - return SS_SUCCESS; -} - - -/** - * is_valid_transition() - Returns an SS_ error code if the state transition is not possible - * This limits hard state transitions. Hard state transitions are facts there are - * imposed on DRBD by the environment. E.g. disk broke or network broke down. - * But those hard state transitions are still not allowed to do everything. - * @ns: new state. - * @os: old state. - */ -static enum drbd_state_rv -is_valid_transition(union drbd_state os, union drbd_state ns) -{ - enum drbd_state_rv rv; - - rv = is_valid_conn_transition(os.conn, ns.conn); - - /* we cannot fail (again) if we already detached */ - if (ns.disk == D_FAILED && os.disk == D_DISKLESS) - rv = SS_IS_DISKLESS; - - return rv; -} - -static void print_sanitize_warnings(struct drbd_conf *mdev, enum sanitize_state_warnings warn) -{ - static const char *msg_table[] = { - [NO_WARNING] = "", - [ABORTED_ONLINE_VERIFY] = "Online-verify aborted.", - [ABORTED_RESYNC] = "Resync aborted.", - [CONNECTION_LOST_NEGOTIATING] = "Connection lost while negotiating, no data!", - [IMPLICITLY_UPGRADED_DISK] = "Implicitly upgraded disk", - [IMPLICITLY_UPGRADED_PDSK] = "Implicitly upgraded pdsk", - }; - - if (warn != NO_WARNING) - dev_warn(DEV, "%s\n", msg_table[warn]); -} - -/** - * sanitize_state() - Resolves implicitly necessary additional changes to a state transition - * @mdev: DRBD device. - * @os: old state. - * @ns: new state. - * @warn_sync_abort: - * - * When we loose connection, we have to set the state of the peers disk (pdsk) - * to D_UNKNOWN. This rule and many more along those lines are in this function. 
- */ -static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state ns, - enum sanitize_state_warnings *warn) -{ - enum drbd_fencing_p fp; - enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max; - - if (warn) - *warn = NO_WARNING; - - fp = FP_DONT_CARE; - if (get_ldev(mdev)) { - rcu_read_lock(); - fp = rcu_dereference(mdev->ldev->disk_conf)->fencing; - rcu_read_unlock(); - put_ldev(mdev); - } - - /* Implications from connection to peer and peer_isp */ - if (ns.conn < C_CONNECTED) { - ns.peer_isp = 0; - ns.peer = R_UNKNOWN; - if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT) - ns.pdsk = D_UNKNOWN; - } - - /* Clear the aftr_isp when becoming unconfigured */ - if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY) - ns.aftr_isp = 0; - - /* An implication of the disk states onto the connection state */ - /* Abort resync if a disk fails/detaches */ - if (ns.conn > C_CONNECTED && (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) { - if (warn) - *warn = ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T ? - ABORTED_ONLINE_VERIFY : ABORTED_RESYNC; - ns.conn = C_CONNECTED; - } - - /* Connection breaks down before we finished "Negotiating" */ - if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING && - get_ldev_if_state(mdev, D_NEGOTIATING)) { - if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) { - ns.disk = mdev->new_state_tmp.disk; - ns.pdsk = mdev->new_state_tmp.pdsk; - } else { - if (warn) - *warn = CONNECTION_LOST_NEGOTIATING; - ns.disk = D_DISKLESS; - ns.pdsk = D_UNKNOWN; - } - put_ldev(mdev); - } - - /* D_CONSISTENT and D_OUTDATED vanish when we get connected */ - if (ns.conn >= C_CONNECTED && ns.conn < C_AHEAD) { - if (ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED) - ns.disk = D_UP_TO_DATE; - if (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED) - ns.pdsk = D_UP_TO_DATE; - } - - /* Implications of the connection stat on the disk states */ - disk_min = D_DISKLESS; - disk_max = D_UP_TO_DATE; - pdsk_min = D_INCONSISTENT; - pdsk_max = D_UNKNOWN; - switch ((enum drbd_conns)ns.conn) { - case C_WF_BITMAP_T: - case C_PAUSED_SYNC_T: - case C_STARTING_SYNC_T: - case C_WF_SYNC_UUID: - case C_BEHIND: - disk_min = D_INCONSISTENT; - disk_max = D_OUTDATED; - pdsk_min = D_UP_TO_DATE; - pdsk_max = D_UP_TO_DATE; - break; - case C_VERIFY_S: - case C_VERIFY_T: - disk_min = D_UP_TO_DATE; - disk_max = D_UP_TO_DATE; - pdsk_min = D_UP_TO_DATE; - pdsk_max = D_UP_TO_DATE; - break; - case C_CONNECTED: - disk_min = D_DISKLESS; - disk_max = D_UP_TO_DATE; - pdsk_min = D_DISKLESS; - pdsk_max = D_UP_TO_DATE; - break; - case C_WF_BITMAP_S: - case C_PAUSED_SYNC_S: - case C_STARTING_SYNC_S: - case C_AHEAD: - disk_min = D_UP_TO_DATE; - disk_max = D_UP_TO_DATE; - pdsk_min = D_INCONSISTENT; - pdsk_max = D_CONSISTENT; /* D_OUTDATED would be nice. 
But explicit outdate necessary*/ - break; - case C_SYNC_TARGET: - disk_min = D_INCONSISTENT; - disk_max = D_INCONSISTENT; - pdsk_min = D_UP_TO_DATE; - pdsk_max = D_UP_TO_DATE; - break; - case C_SYNC_SOURCE: - disk_min = D_UP_TO_DATE; - disk_max = D_UP_TO_DATE; - pdsk_min = D_INCONSISTENT; - pdsk_max = D_INCONSISTENT; - break; - case C_STANDALONE: - case C_DISCONNECTING: - case C_UNCONNECTED: - case C_TIMEOUT: - case C_BROKEN_PIPE: - case C_NETWORK_FAILURE: - case C_PROTOCOL_ERROR: - case C_TEAR_DOWN: - case C_WF_CONNECTION: - case C_WF_REPORT_PARAMS: - case C_MASK: - break; - } - if (ns.disk > disk_max) - ns.disk = disk_max; - - if (ns.disk < disk_min) { - if (warn) - *warn = IMPLICITLY_UPGRADED_DISK; - ns.disk = disk_min; - } - if (ns.pdsk > pdsk_max) - ns.pdsk = pdsk_max; - - if (ns.pdsk < pdsk_min) { - if (warn) - *warn = IMPLICITLY_UPGRADED_PDSK; - ns.pdsk = pdsk_min; - } - - if (fp == FP_STONITH && - (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED)) - ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */ - - if (mdev->tconn->res_opts.on_no_data == OND_SUSPEND_IO && - (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)) - ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data available) */ - - if (ns.aftr_isp || ns.peer_isp || ns.user_isp) { - if (ns.conn == C_SYNC_SOURCE) - ns.conn = C_PAUSED_SYNC_S; - if (ns.conn == C_SYNC_TARGET) - ns.conn = C_PAUSED_SYNC_T; - } else { - if (ns.conn == C_PAUSED_SYNC_S) - ns.conn = C_SYNC_SOURCE; - if (ns.conn == C_PAUSED_SYNC_T) - ns.conn = C_SYNC_TARGET; - } - - return ns; -} - -void drbd_resume_al(struct drbd_conf *mdev) -{ - if (test_and_clear_bit(AL_SUSPENDED, &mdev->flags)) - dev_info(DEV, "Resumed AL updates\n"); -} - -/* helper for __drbd_set_state */ -static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs) -{ - if (mdev->tconn->agreed_pro_version < 90) - mdev->ov_start_sector = 0; - mdev->rs_total = drbd_bm_bits(mdev); - mdev->ov_position = 0; - if (cs == C_VERIFY_T) { - /* starting online verify from an arbitrary position - * does not fit well into the existing protocol. - * on C_VERIFY_T, we initialize ov_left and friends - * implicitly in receive_DataRequest once the - * first P_OV_REQUEST is received */ - mdev->ov_start_sector = ~(sector_t)0; - } else { - unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector); - if (bit >= mdev->rs_total) { - mdev->ov_start_sector = - BM_BIT_TO_SECT(mdev->rs_total - 1); - mdev->rs_total = 1; - } else - mdev->rs_total -= bit; - mdev->ov_position = mdev->ov_start_sector; - } - mdev->ov_left = mdev->rs_total; -} - -/** - * __drbd_set_state() - Set a new DRBD state - * @mdev: DRBD device. - * @ns: new state. - * @flags: Flags - * @done: Optional completion, that will get completed after the after_state_ch() finished - * - * Caller needs to hold req_lock, and global_state_lock. Do not call directly. 
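sanitize_state() above folds many of its implication rules into a per-connection-state [disk_min, disk_max] and [pdsk_min, pdsk_max] window and then clamps the requested state into that window, warning when a disk state has to be implicitly upgraded. A minimal stand-alone sketch of that clamping step; the enum values and the chosen window are illustrative only.

#include <stdio.h>

enum disk { D_DISKLESS, D_INCONSISTENT, D_OUTDATED, D_CONSISTENT, D_UP_TO_DATE };

static enum disk clamp_disk(enum disk want, enum disk min, enum disk max)
{
	if (want > max)
		return max;
	if (want < min) {
		printf("implicitly upgrading disk state %d -> %d\n", want, min);
		return min;
	}
	return want;
}

int main(void)
{
	/* e.g. while a resync towards us is starting (C_STARTING_SYNC_T in
	 * the code above), the local disk must stay within
	 * [D_INCONSISTENT, D_OUTDATED] */
	enum disk d = clamp_disk(D_UP_TO_DATE, D_INCONSISTENT, D_OUTDATED);

	printf("sanitized disk state: %d\n", d);	/* 2 == D_OUTDATED */
	return 0;
}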
- */ -enum drbd_state_rv -__drbd_set_state(struct drbd_conf *mdev, union drbd_state ns, - enum chg_state_flags flags, struct completion *done) -{ - union drbd_state os; - enum drbd_state_rv rv = SS_SUCCESS; - enum sanitize_state_warnings ssw; - struct after_state_chg_work *ascw; - - os = drbd_read_state(mdev); - - ns = sanitize_state(mdev, ns, &ssw); - if (ns.i == os.i) - return SS_NOTHING_TO_DO; - - rv = is_valid_transition(os, ns); - if (rv < SS_SUCCESS) - return rv; - - if (!(flags & CS_HARD)) { - /* pre-state-change checks ; only look at ns */ - /* See drbd_state_sw_errors in drbd_strings.c */ - - rv = is_valid_state(mdev, ns); - if (rv < SS_SUCCESS) { - /* If the old state was illegal as well, then let - this happen...*/ - - if (is_valid_state(mdev, os) == rv) - rv = is_valid_soft_transition(os, ns, mdev->tconn); - } else - rv = is_valid_soft_transition(os, ns, mdev->tconn); - } - - if (rv < SS_SUCCESS) { - if (flags & CS_VERBOSE) - print_st_err(mdev, os, ns, rv); - return rv; - } - - print_sanitize_warnings(mdev, ssw); - - drbd_pr_state_change(mdev, os, ns, flags); - - /* Display changes to the susp* flags that where caused by the call to - sanitize_state(). Only display it here if we where not called from - _conn_request_state() */ - if (!(flags & CS_DC_SUSP)) - conn_pr_state_change(mdev->tconn, os, ns, (flags & ~CS_DC_MASK) | CS_DC_SUSP); - - /* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference - * on the ldev here, to be sure the transition -> D_DISKLESS resp. - * drbd_ldev_destroy() won't happen before our corresponding - * after_state_ch works run, where we put_ldev again. */ - if ((os.disk != D_FAILED && ns.disk == D_FAILED) || - (os.disk != D_DISKLESS && ns.disk == D_DISKLESS)) - atomic_inc(&mdev->local_cnt); - - mdev->state.i = ns.i; - mdev->tconn->susp = ns.susp; - mdev->tconn->susp_nod = ns.susp_nod; - mdev->tconn->susp_fen = ns.susp_fen; - - if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING) - drbd_print_uuids(mdev, "attached to UUIDs"); - - /* Wake up role changes, that were delayed because of connection establishing */ - if (os.conn == C_WF_REPORT_PARAMS && ns.conn != C_WF_REPORT_PARAMS && - no_peer_wf_report_params(mdev->tconn)) - clear_bit(STATE_SENT, &mdev->tconn->flags); - - wake_up(&mdev->misc_wait); - wake_up(&mdev->state_wait); - wake_up(&mdev->tconn->ping_wait); - - /* Aborted verify run, or we reached the stop sector. - * Log the last position, unless end-of-device. 
*/ - if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) && - ns.conn <= C_CONNECTED) { - mdev->ov_start_sector = - BM_BIT_TO_SECT(drbd_bm_bits(mdev) - mdev->ov_left); - if (mdev->ov_left) - dev_info(DEV, "Online Verify reached sector %llu\n", - (unsigned long long)mdev->ov_start_sector); - } - - if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) && - (ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)) { - dev_info(DEV, "Syncer continues.\n"); - mdev->rs_paused += (long)jiffies - -(long)mdev->rs_mark_time[mdev->rs_last_mark]; - if (ns.conn == C_SYNC_TARGET) - mod_timer(&mdev->resync_timer, jiffies); - } - - if ((os.conn == C_SYNC_TARGET || os.conn == C_SYNC_SOURCE) && - (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) { - dev_info(DEV, "Resync suspended\n"); - mdev->rs_mark_time[mdev->rs_last_mark] = jiffies; - } - - if (os.conn == C_CONNECTED && - (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) { - unsigned long now = jiffies; - int i; - - set_ov_position(mdev, ns.conn); - mdev->rs_start = now; - mdev->rs_last_events = 0; - mdev->rs_last_sect_ev = 0; - mdev->ov_last_oos_size = 0; - mdev->ov_last_oos_start = 0; - - for (i = 0; i < DRBD_SYNC_MARKS; i++) { - mdev->rs_mark_left[i] = mdev->ov_left; - mdev->rs_mark_time[i] = now; - } - - drbd_rs_controller_reset(mdev); - - if (ns.conn == C_VERIFY_S) { - dev_info(DEV, "Starting Online Verify from sector %llu\n", - (unsigned long long)mdev->ov_position); - mod_timer(&mdev->resync_timer, jiffies); - } - } - - if (get_ldev(mdev)) { - u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND| - MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE| - MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY); - - mdf &= ~MDF_AL_CLEAN; - if (test_bit(CRASHED_PRIMARY, &mdev->flags)) - mdf |= MDF_CRASHED_PRIMARY; - if (mdev->state.role == R_PRIMARY || - (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY)) - mdf |= MDF_PRIMARY_IND; - if (mdev->state.conn > C_WF_REPORT_PARAMS) - mdf |= MDF_CONNECTED_IND; - if (mdev->state.disk > D_INCONSISTENT) - mdf |= MDF_CONSISTENT; - if (mdev->state.disk > D_OUTDATED) - mdf |= MDF_WAS_UP_TO_DATE; - if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT) - mdf |= MDF_PEER_OUT_DATED; - if (mdf != mdev->ldev->md.flags) { - mdev->ldev->md.flags = mdf; - drbd_md_mark_dirty(mdev); - } - if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT) - drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]); - put_ldev(mdev); - } - - /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */ - if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT && - os.peer == R_SECONDARY && ns.peer == R_PRIMARY) - set_bit(CONSIDER_RESYNC, &mdev->flags); - - /* Receiver should clean up itself */ - if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING) - drbd_thread_stop_nowait(&mdev->tconn->receiver); - - /* Now the receiver finished cleaning up itself, it should die */ - if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE) - drbd_thread_stop_nowait(&mdev->tconn->receiver); - - /* Upon network failure, we need to restart the receiver. 
*/ - if (os.conn > C_WF_CONNECTION && - ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT) - drbd_thread_restart_nowait(&mdev->tconn->receiver); - - /* Resume AL writing if we get a connection */ - if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) - drbd_resume_al(mdev); - - /* remember last attach time so request_timer_fn() won't - * kill newly established sessions while we are still trying to thaw - * previously frozen IO */ - if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) && - ns.disk > D_NEGOTIATING) - mdev->last_reattach_jif = jiffies; - - ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC); - if (ascw) { - ascw->os = os; - ascw->ns = ns; - ascw->flags = flags; - ascw->w.cb = w_after_state_ch; - ascw->w.mdev = mdev; - ascw->done = done; - drbd_queue_work(&mdev->tconn->sender_work, &ascw->w); - } else { - dev_err(DEV, "Could not kmalloc an ascw\n"); - } - - return rv; -} - -static int w_after_state_ch(struct drbd_work *w, int unused) -{ - struct after_state_chg_work *ascw = - container_of(w, struct after_state_chg_work, w); - struct drbd_conf *mdev = w->mdev; - - after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags); - if (ascw->flags & CS_WAIT_COMPLETE) { - D_ASSERT(ascw->done != NULL); - complete(ascw->done); - } - kfree(ascw); - - return 0; -} - -static void abw_start_sync(struct drbd_conf *mdev, int rv) -{ - if (rv) { - dev_err(DEV, "Writing the bitmap failed not starting resync.\n"); - _drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE); - return; - } - - switch (mdev->state.conn) { - case C_STARTING_SYNC_T: - _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE); - break; - case C_STARTING_SYNC_S: - drbd_start_resync(mdev, C_SYNC_SOURCE); - break; - } -} - -int drbd_bitmap_io_from_worker(struct drbd_conf *mdev, - int (*io_fn)(struct drbd_conf *), - char *why, enum bm_flag flags) -{ - int rv; - - D_ASSERT(current == mdev->tconn->worker.task); - - /* open coded non-blocking drbd_suspend_io(mdev); */ - set_bit(SUSPEND_IO, &mdev->flags); - - drbd_bm_lock(mdev, why, flags); - rv = io_fn(mdev); - drbd_bm_unlock(mdev); - - drbd_resume_io(mdev); - - return rv; -} - -/** - * after_state_ch() - Perform after state change actions that may sleep - * @mdev: DRBD device. - * @os: old state. - * @ns: new state. - * @flags: Flags - */ -static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, - union drbd_state ns, enum chg_state_flags flags) -{ - struct sib_info sib; - - sib.sib_reason = SIB_STATE_CHANGE; - sib.os = os; - sib.ns = ns; - - if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) { - clear_bit(CRASHED_PRIMARY, &mdev->flags); - if (mdev->p_uuid) - mdev->p_uuid[UI_FLAGS] &= ~((u64)2); - } - - /* Inform userspace about the change... */ - drbd_bcast_event(mdev, &sib); - - if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) && - (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)) - drbd_khelper(mdev, "pri-on-incon-degr"); - - /* Here we have the actions that are performed after a - state change. 
This function might sleep */ - - if (ns.susp_nod) { - struct drbd_tconn *tconn = mdev->tconn; - enum drbd_req_event what = NOTHING; - - spin_lock_irq(&tconn->req_lock); - if (os.conn < C_CONNECTED && conn_lowest_conn(tconn) >= C_CONNECTED) - what = RESEND; - - if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) && - conn_lowest_disk(tconn) > D_NEGOTIATING) - what = RESTART_FROZEN_DISK_IO; - - if (tconn->susp_nod && what != NOTHING) { - _tl_restart(tconn, what); - _conn_request_state(tconn, - (union drbd_state) { { .susp_nod = 1 } }, - (union drbd_state) { { .susp_nod = 0 } }, - CS_VERBOSE); - } - spin_unlock_irq(&tconn->req_lock); - } - - if (ns.susp_fen) { - struct drbd_tconn *tconn = mdev->tconn; - - spin_lock_irq(&tconn->req_lock); - if (tconn->susp_fen && conn_lowest_conn(tconn) >= C_CONNECTED) { - /* case2: The connection was established again: */ - struct drbd_conf *odev; - int vnr; - - rcu_read_lock(); - idr_for_each_entry(&tconn->volumes, odev, vnr) - clear_bit(NEW_CUR_UUID, &odev->flags); - rcu_read_unlock(); - _tl_restart(tconn, RESEND); - _conn_request_state(tconn, - (union drbd_state) { { .susp_fen = 1 } }, - (union drbd_state) { { .susp_fen = 0 } }, - CS_VERBOSE); - } - spin_unlock_irq(&tconn->req_lock); - } - - /* Became sync source. With protocol >= 96, we still need to send out - * the sync uuid now. Need to do that before any drbd_send_state, or - * the other side may go "paused sync" before receiving the sync uuids, - * which is unexpected. */ - if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) && - (ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) && - mdev->tconn->agreed_pro_version >= 96 && get_ldev(mdev)) { - drbd_gen_and_send_sync_uuid(mdev); - put_ldev(mdev); - } - - /* Do not change the order of the if above and the two below... */ - if (os.pdsk == D_DISKLESS && - ns.pdsk > D_DISKLESS && ns.pdsk != D_UNKNOWN) { /* attach on the peer */ - /* we probably will start a resync soon. - * make sure those things are properly reset. */ - mdev->rs_total = 0; - mdev->rs_failed = 0; - atomic_set(&mdev->rs_pending_cnt, 0); - drbd_rs_cancel_all(mdev); - - drbd_send_uuids(mdev); - drbd_send_state(mdev, ns); - } - /* No point in queuing send_bitmap if we don't have a connection - * anymore, so check also the _current_ state, not only the new state - * at the time this work was queued. */ - if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S && - mdev->state.conn == C_WF_BITMAP_S) - drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL, - "send_bitmap (WFBitMapS)", - BM_LOCKED_TEST_ALLOWED); - - /* Lost contact to peer's copy of the data */ - if ((os.pdsk >= D_INCONSISTENT && - os.pdsk != D_UNKNOWN && - os.pdsk != D_OUTDATED) - && (ns.pdsk < D_INCONSISTENT || - ns.pdsk == D_UNKNOWN || - ns.pdsk == D_OUTDATED)) { - if (get_ldev(mdev)) { - if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) && - mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { - if (drbd_suspended(mdev)) { - set_bit(NEW_CUR_UUID, &mdev->flags); - } else { - drbd_uuid_new_current(mdev); - drbd_send_uuids(mdev); - } - } - put_ldev(mdev); - } - } - - if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) { - if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY && - mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { - drbd_uuid_new_current(mdev); - drbd_send_uuids(mdev); - } - /* D_DISKLESS Peer becomes secondary */ - if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY) - /* We may still be Primary ourselves. 
- * No harm done if the bitmap still changes, - * redirtied pages will follow later. */ - drbd_bitmap_io_from_worker(mdev, &drbd_bm_write, - "demote diskless peer", BM_LOCKED_SET_ALLOWED); - put_ldev(mdev); - } - - /* Write out all changed bits on demote. - * Though, no need to da that just yet - * if there is a resync going on still */ - if (os.role == R_PRIMARY && ns.role == R_SECONDARY && - mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) { - /* No changes to the bitmap expected this time, so assert that, - * even though no harm was done if it did change. */ - drbd_bitmap_io_from_worker(mdev, &drbd_bm_write, - "demote", BM_LOCKED_TEST_ALLOWED); - put_ldev(mdev); - } - - /* Last part of the attaching process ... */ - if (ns.conn >= C_CONNECTED && - os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) { - drbd_send_sizes(mdev, 0, 0); /* to start sync... */ - drbd_send_uuids(mdev); - drbd_send_state(mdev, ns); - } - - /* We want to pause/continue resync, tell peer. */ - if (ns.conn >= C_CONNECTED && - ((os.aftr_isp != ns.aftr_isp) || - (os.user_isp != ns.user_isp))) - drbd_send_state(mdev, ns); - - /* In case one of the isp bits got set, suspend other devices. */ - if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) && - (ns.aftr_isp || ns.peer_isp || ns.user_isp)) - suspend_other_sg(mdev); - - /* Make sure the peer gets informed about eventual state - changes (ISP bits) while we were in WFReportParams. */ - if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED) - drbd_send_state(mdev, ns); - - if (os.conn != C_AHEAD && ns.conn == C_AHEAD) - drbd_send_state(mdev, ns); - - /* We are in the progress to start a full sync... */ - if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) || - (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S)) - /* no other bitmap changes expected during this phase */ - drbd_queue_bitmap_io(mdev, - &drbd_bmio_set_n_write, &abw_start_sync, - "set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED); - - /* We are invalidating our self... */ - if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED && - os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT) - /* other bitmap operation expected during this phase */ - drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, - "set_n_write from invalidate", BM_LOCKED_MASK); - - /* first half of local IO error, failure to attach, - * or administrative detach */ - if (os.disk != D_FAILED && ns.disk == D_FAILED) { - enum drbd_io_error_p eh = EP_PASS_ON; - int was_io_error = 0; - /* corresponding get_ldev was in __drbd_set_state, to serialize - * our cleanup here with the transition to D_DISKLESS. - * But is is still not save to dreference ldev here, since - * we might come from an failed Attach before ldev was set. */ - if (mdev->ldev) { - rcu_read_lock(); - eh = rcu_dereference(mdev->ldev->disk_conf)->on_io_error; - rcu_read_unlock(); - - was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags); - - if (was_io_error && eh == EP_CALL_HELPER) - drbd_khelper(mdev, "local-io-error"); - - /* Immediately allow completion of all application IO, - * that waits for completion from the local disk, - * if this was a force-detach due to disk_timeout - * or administrator request (drbdsetup detach --force). - * Do NOT abort otherwise. - * Aborting local requests may cause serious problems, - * if requests are completed to upper layers already, - * and then later the already submitted local bio completes. - * This can cause DMA into former bio pages that meanwhile - * have been re-used for other things. 
- * So aborting local requests may cause crashes, - * or even worse, silent data corruption. - */ - if (test_and_clear_bit(FORCE_DETACH, &mdev->flags)) - tl_abort_disk_io(mdev); - - /* current state still has to be D_FAILED, - * there is only one way out: to D_DISKLESS, - * and that may only happen after our put_ldev below. */ - if (mdev->state.disk != D_FAILED) - dev_err(DEV, - "ASSERT FAILED: disk is %s during detach\n", - drbd_disk_str(mdev->state.disk)); - - if (ns.conn >= C_CONNECTED) - drbd_send_state(mdev, ns); - - drbd_rs_cancel_all(mdev); - - /* In case we want to get something to stable storage still, - * this may be the last chance. - * Following put_ldev may transition to D_DISKLESS. */ - drbd_md_sync(mdev); - } - put_ldev(mdev); - } - - /* second half of local IO error, failure to attach, - * or administrative detach, - * after local_cnt references have reached zero again */ - if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) { - /* We must still be diskless, - * re-attach has to be serialized with this! */ - if (mdev->state.disk != D_DISKLESS) - dev_err(DEV, - "ASSERT FAILED: disk is %s while going diskless\n", - drbd_disk_str(mdev->state.disk)); - - if (ns.conn >= C_CONNECTED) - drbd_send_state(mdev, ns); - /* corresponding get_ldev in __drbd_set_state - * this may finally trigger drbd_ldev_destroy. */ - put_ldev(mdev); - } - - /* Notify peer that I had a local IO error, and did not detached.. */ - if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT && ns.conn >= C_CONNECTED) - drbd_send_state(mdev, ns); - - /* Disks got bigger while they were detached */ - if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING && - test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) { - if (ns.conn == C_CONNECTED) - resync_after_online_grow(mdev); - } - - /* A resync finished or aborted, wake paused devices... */ - if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) || - (os.peer_isp && !ns.peer_isp) || - (os.user_isp && !ns.user_isp)) - resume_next_sg(mdev); - - /* sync target done with resync. Explicitly notify peer, even though - * it should (at least for non-empty resyncs) already know itself. */ - if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED) - drbd_send_state(mdev, ns); - - /* Verify finished, or reached stop sector. Peer did not know about - * the stop sector, and we may even have changed the stop sector during - * verify to interrupt/stop early. Send the new state. */ - if (os.conn == C_VERIFY_S && ns.conn == C_CONNECTED - && verify_can_do_stop_sector(mdev)) - drbd_send_state(mdev, ns); - - /* This triggers bitmap writeout of potentially still unwritten pages - * if the resync finished cleanly, or aborted because of peer disk - * failure, or because of connection loss. - * For resync aborted because of local disk failure, we cannot do - * any bitmap writeout anymore. - * No harm done if some bits change during this phase. 
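Much of the code above guards access to the backing device with the get_ldev()/put_ldev() pair: take a reference only if the disk is still attached, and drop it when done, so that the final put can eventually trigger drbd_ldev_destroy(). A minimal stand-alone sketch of that reference-count pattern; the C11 atomics and the attached flag are stand-ins and not the kernel's implementation.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct demo_dev {
	atomic_int local_cnt;
	bool attached;		/* stands in for "disk state >= D_INCONSISTENT" */
};

static bool get_ldev(struct demo_dev *d)
{
	atomic_fetch_add(&d->local_cnt, 1);
	if (!d->attached) {
		atomic_fetch_sub(&d->local_cnt, 1);
		return false;		/* no reference taken */
	}
	return true;			/* caller must put_ldev() later */
}

static void put_ldev(struct demo_dev *d)
{
	atomic_fetch_sub(&d->local_cnt, 1);
}

int main(void)
{
	struct demo_dev d = { .local_cnt = 0, .attached = true };

	if (get_ldev(&d)) {
		printf("backing device still attached, safe to use it\n");
		put_ldev(&d);
	}
	return 0;
}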
- */ - if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(mdev)) { - drbd_queue_bitmap_io(mdev, &drbd_bm_write_copy_pages, NULL, - "write from resync_finished", BM_LOCKED_CHANGE_ALLOWED); - put_ldev(mdev); - } - - if (ns.disk == D_DISKLESS && - ns.conn == C_STANDALONE && - ns.role == R_SECONDARY) { - if (os.aftr_isp != ns.aftr_isp) - resume_next_sg(mdev); - } - - drbd_md_sync(mdev); -} - -struct after_conn_state_chg_work { - struct drbd_work w; - enum drbd_conns oc; - union drbd_state ns_min; - union drbd_state ns_max; /* new, max state, over all mdevs */ - enum chg_state_flags flags; -}; - -static int w_after_conn_state_ch(struct drbd_work *w, int unused) -{ - struct after_conn_state_chg_work *acscw = - container_of(w, struct after_conn_state_chg_work, w); - struct drbd_tconn *tconn = w->tconn; - enum drbd_conns oc = acscw->oc; - union drbd_state ns_max = acscw->ns_max; - struct drbd_conf *mdev; - int vnr; - - kfree(acscw); - - /* Upon network configuration, we need to start the receiver */ - if (oc == C_STANDALONE && ns_max.conn == C_UNCONNECTED) - drbd_thread_start(&tconn->receiver); - - if (oc == C_DISCONNECTING && ns_max.conn == C_STANDALONE) { - struct net_conf *old_conf; - - mutex_lock(&tconn->conf_update); - old_conf = tconn->net_conf; - tconn->my_addr_len = 0; - tconn->peer_addr_len = 0; - rcu_assign_pointer(tconn->net_conf, NULL); - conn_free_crypto(tconn); - mutex_unlock(&tconn->conf_update); - - synchronize_rcu(); - kfree(old_conf); - } - - if (ns_max.susp_fen) { - /* case1: The outdate peer handler is successful: */ - if (ns_max.pdsk <= D_OUTDATED) { - rcu_read_lock(); - idr_for_each_entry(&tconn->volumes, mdev, vnr) { - if (test_bit(NEW_CUR_UUID, &mdev->flags)) { - drbd_uuid_new_current(mdev); - clear_bit(NEW_CUR_UUID, &mdev->flags); - } - } - rcu_read_unlock(); - spin_lock_irq(&tconn->req_lock); - _tl_restart(tconn, CONNECTION_LOST_WHILE_PENDING); - _conn_request_state(tconn, - (union drbd_state) { { .susp_fen = 1 } }, - (union drbd_state) { { .susp_fen = 0 } }, - CS_VERBOSE); - spin_unlock_irq(&tconn->req_lock); - } - } - kref_put(&tconn->kref, &conn_destroy); - - conn_md_sync(tconn); - - return 0; -} - -void conn_old_common_state(struct drbd_tconn *tconn, union drbd_state *pcs, enum chg_state_flags *pf) -{ - enum chg_state_flags flags = ~0; - struct drbd_conf *mdev; - int vnr, first_vol = 1; - union drbd_dev_state os, cs = { - { .role = R_SECONDARY, - .peer = R_UNKNOWN, - .conn = tconn->cstate, - .disk = D_DISKLESS, - .pdsk = D_UNKNOWN, - } }; - - rcu_read_lock(); - idr_for_each_entry(&tconn->volumes, mdev, vnr) { - os = mdev->state; - - if (first_vol) { - cs = os; - first_vol = 0; - continue; - } - - if (cs.role != os.role) - flags &= ~CS_DC_ROLE; - - if (cs.peer != os.peer) - flags &= ~CS_DC_PEER; - - if (cs.conn != os.conn) - flags &= ~CS_DC_CONN; - - if (cs.disk != os.disk) - flags &= ~CS_DC_DISK; - - if (cs.pdsk != os.pdsk) - flags &= ~CS_DC_PDSK; - } - rcu_read_unlock(); - - *pf |= CS_DC_MASK; - *pf &= flags; - (*pcs).i = cs.i; -} - -static enum drbd_state_rv -conn_is_valid_transition(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val, - enum chg_state_flags flags) -{ - enum drbd_state_rv rv = SS_SUCCESS; - union drbd_state ns, os; - struct drbd_conf *mdev; - int vnr; - - rcu_read_lock(); - idr_for_each_entry(&tconn->volumes, mdev, vnr) { - os = drbd_read_state(mdev); - ns = sanitize_state(mdev, apply_mask_val(os, mask, val), NULL); - - if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED) - ns.disk = os.disk; 
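conn_is_valid_transition() and conn_set_state() above both build the candidate state with apply_mask_val(os, mask, val): a state-change request is just a (mask, value) pair applied over the integer view of the state union. A minimal stand-alone sketch of that idiom; the field names and widths are illustrative, not union drbd_state's.

#include <stdio.h>

union demo_state {
	struct {
		unsigned role:2;
		unsigned peer:2;
		unsigned conn:5;
	};
	unsigned int i;
};

static union demo_state apply_mask_val(union demo_state os,
				       union demo_state mask,
				       union demo_state val)
{
	os.i &= ~mask.i;	/* clear the fields named by the mask ... */
	os.i |= val.i;		/* ... and fill in the requested values */
	return os;
}

int main(void)
{
	union demo_state os = { .i = 0 }, mask = { .i = 0 }, val = { .i = 0 };
	union demo_state ns;

	os.role = 1;
	os.conn = 10;			/* current state */
	mask.conn = (1u << 5) - 1;	/* "change only .conn" */
	val.conn = 17;			/* requested connection state */

	ns = apply_mask_val(os, mask, val);
	printf("role=%u conn=%u\n", (unsigned)ns.role, (unsigned)ns.conn);
	/* role stays 1, conn becomes 17 */
	return 0;
}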
- - if (ns.i == os.i) - continue; - - rv = is_valid_transition(os, ns); - if (rv < SS_SUCCESS) - break; - - if (!(flags & CS_HARD)) { - rv = is_valid_state(mdev, ns); - if (rv < SS_SUCCESS) { - if (is_valid_state(mdev, os) == rv) - rv = is_valid_soft_transition(os, ns, tconn); - } else - rv = is_valid_soft_transition(os, ns, tconn); - } - if (rv < SS_SUCCESS) - break; - } - rcu_read_unlock(); - - if (rv < SS_SUCCESS && flags & CS_VERBOSE) - print_st_err(mdev, os, ns, rv); - - return rv; -} - -void -conn_set_state(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val, - union drbd_state *pns_min, union drbd_state *pns_max, enum chg_state_flags flags) -{ - union drbd_state ns, os, ns_max = { }; - union drbd_state ns_min = { - { .role = R_MASK, - .peer = R_MASK, - .conn = val.conn, - .disk = D_MASK, - .pdsk = D_MASK - } }; - struct drbd_conf *mdev; - enum drbd_state_rv rv; - int vnr, number_of_volumes = 0; - - if (mask.conn == C_MASK) { - /* remember last connect time so request_timer_fn() won't - * kill newly established sessions while we are still trying to thaw - * previously frozen IO */ - if (tconn->cstate != C_WF_REPORT_PARAMS && val.conn == C_WF_REPORT_PARAMS) - tconn->last_reconnect_jif = jiffies; - - tconn->cstate = val.conn; - } - - rcu_read_lock(); - idr_for_each_entry(&tconn->volumes, mdev, vnr) { - number_of_volumes++; - os = drbd_read_state(mdev); - ns = apply_mask_val(os, mask, val); - ns = sanitize_state(mdev, ns, NULL); - - if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED) - ns.disk = os.disk; - - rv = __drbd_set_state(mdev, ns, flags, NULL); - if (rv < SS_SUCCESS) - BUG(); - - ns.i = mdev->state.i; - ns_max.role = max_role(ns.role, ns_max.role); - ns_max.peer = max_role(ns.peer, ns_max.peer); - ns_max.conn = max_t(enum drbd_conns, ns.conn, ns_max.conn); - ns_max.disk = max_t(enum drbd_disk_state, ns.disk, ns_max.disk); - ns_max.pdsk = max_t(enum drbd_disk_state, ns.pdsk, ns_max.pdsk); - - ns_min.role = min_role(ns.role, ns_min.role); - ns_min.peer = min_role(ns.peer, ns_min.peer); - ns_min.conn = min_t(enum drbd_conns, ns.conn, ns_min.conn); - ns_min.disk = min_t(enum drbd_disk_state, ns.disk, ns_min.disk); - ns_min.pdsk = min_t(enum drbd_disk_state, ns.pdsk, ns_min.pdsk); - } - rcu_read_unlock(); - - if (number_of_volumes == 0) { - ns_min = ns_max = (union drbd_state) { { - .role = R_SECONDARY, - .peer = R_UNKNOWN, - .conn = val.conn, - .disk = D_DISKLESS, - .pdsk = D_UNKNOWN - } }; - } - - ns_min.susp = ns_max.susp = tconn->susp; - ns_min.susp_nod = ns_max.susp_nod = tconn->susp_nod; - ns_min.susp_fen = ns_max.susp_fen = tconn->susp_fen; - - *pns_min = ns_min; - *pns_max = ns_max; -} - -static enum drbd_state_rv -_conn_rq_cond(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val) -{ - enum drbd_state_rv rv; - - if (test_and_clear_bit(CONN_WD_ST_CHG_OKAY, &tconn->flags)) - return SS_CW_SUCCESS; - - if (test_and_clear_bit(CONN_WD_ST_CHG_FAIL, &tconn->flags)) - return SS_CW_FAILED_BY_PEER; - - rv = tconn->cstate != C_WF_REPORT_PARAMS ? SS_CW_NO_NEED : SS_UNKNOWN_ERROR; - - if (rv == SS_UNKNOWN_ERROR) - rv = conn_is_valid_transition(tconn, mask, val, 0); - - if (rv == SS_SUCCESS) - rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. 
*/ - - return rv; -} - -enum drbd_state_rv -_conn_request_state(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val, - enum chg_state_flags flags) -{ - enum drbd_state_rv rv = SS_SUCCESS; - struct after_conn_state_chg_work *acscw; - enum drbd_conns oc = tconn->cstate; - union drbd_state ns_max, ns_min, os; - bool have_mutex = false; - - if (mask.conn) { - rv = is_valid_conn_transition(oc, val.conn); - if (rv < SS_SUCCESS) - goto abort; - } - - rv = conn_is_valid_transition(tconn, mask, val, flags); - if (rv < SS_SUCCESS) - goto abort; - - if (oc == C_WF_REPORT_PARAMS && val.conn == C_DISCONNECTING && - !(flags & (CS_LOCAL_ONLY | CS_HARD))) { - - /* This will be a cluster-wide state change. - * Need to give up the spinlock, grab the mutex, - * then send the state change request, ... */ - spin_unlock_irq(&tconn->req_lock); - mutex_lock(&tconn->cstate_mutex); - have_mutex = true; - - set_bit(CONN_WD_ST_CHG_REQ, &tconn->flags); - if (conn_send_state_req(tconn, mask, val)) { - /* sending failed. */ - clear_bit(CONN_WD_ST_CHG_REQ, &tconn->flags); - rv = SS_CW_FAILED_BY_PEER; - /* need to re-aquire the spin lock, though */ - goto abort_unlocked; - } - - if (val.conn == C_DISCONNECTING) - set_bit(DISCONNECT_SENT, &tconn->flags); - - /* ... and re-aquire the spinlock. - * If _conn_rq_cond() returned >= SS_SUCCESS, we must call - * conn_set_state() within the same spinlock. */ - spin_lock_irq(&tconn->req_lock); - wait_event_lock_irq(tconn->ping_wait, - (rv = _conn_rq_cond(tconn, mask, val)), - tconn->req_lock); - clear_bit(CONN_WD_ST_CHG_REQ, &tconn->flags); - if (rv < SS_SUCCESS) - goto abort; - } - - conn_old_common_state(tconn, &os, &flags); - flags |= CS_DC_SUSP; - conn_set_state(tconn, mask, val, &ns_min, &ns_max, flags); - conn_pr_state_change(tconn, os, ns_max, flags); - - acscw = kmalloc(sizeof(*acscw), GFP_ATOMIC); - if (acscw) { - acscw->oc = os.conn; - acscw->ns_min = ns_min; - acscw->ns_max = ns_max; - acscw->flags = flags; - acscw->w.cb = w_after_conn_state_ch; - kref_get(&tconn->kref); - acscw->w.tconn = tconn; - drbd_queue_work(&tconn->sender_work, &acscw->w); - } else { - conn_err(tconn, "Could not kmalloc an acscw\n"); - } - - abort: - if (have_mutex) { - /* mutex_unlock() "... must not be used in interrupt context.", - * so give up the spinlock, then re-aquire it */ - spin_unlock_irq(&tconn->req_lock); - abort_unlocked: - mutex_unlock(&tconn->cstate_mutex); - spin_lock_irq(&tconn->req_lock); - } - if (rv < SS_SUCCESS && flags & CS_VERBOSE) { - conn_err(tconn, "State change failed: %s\n", drbd_set_st_err_str(rv)); - conn_err(tconn, " mask = 0x%x val = 0x%x\n", mask.i, val.i); - conn_err(tconn, " old_conn:%s wanted_conn:%s\n", drbd_conn_str(oc), drbd_conn_str(val.conn)); - } - return rv; -} - -enum drbd_state_rv -conn_request_state(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val, - enum chg_state_flags flags) -{ - enum drbd_state_rv rv; - - spin_lock_irq(&tconn->req_lock); - rv = _conn_request_state(tconn, mask, val, flags); - spin_unlock_irq(&tconn->req_lock); - - return rv; -} diff --git a/trunk/drivers/block/drbd/drbd_state.h b/trunk/drivers/block/drbd/drbd_state.h deleted file mode 100644 index a3c361bbc4b6..000000000000 --- a/trunk/drivers/block/drbd/drbd_state.h +++ /dev/null @@ -1,161 +0,0 @@ -#ifndef DRBD_STATE_H -#define DRBD_STATE_H - -struct drbd_conf; -struct drbd_tconn; - -/** - * DOC: DRBD State macros - * - * These macros are used to express state changes in easily readable form. 
- * - * The NS macros expand to a mask and a value, that can be bit ored onto the - * current state as soon as the spinlock (req_lock) was taken. - * - * The _NS macros are used for state functions that get called with the - * spinlock. These macros expand directly to the new state value. - * - * Besides the basic forms NS() and _NS() additional _?NS[23] are defined - * to express state changes that affect more than one aspect of the state. - * - * E.g. NS2(conn, C_CONNECTED, peer, R_SECONDARY) - * Means that the network connection was established and that the peer - * is in secondary role. - */ -#define role_MASK R_MASK -#define peer_MASK R_MASK -#define disk_MASK D_MASK -#define pdsk_MASK D_MASK -#define conn_MASK C_MASK -#define susp_MASK 1 -#define user_isp_MASK 1 -#define aftr_isp_MASK 1 -#define susp_nod_MASK 1 -#define susp_fen_MASK 1 - -#define NS(T, S) \ - ({ union drbd_state mask; mask.i = 0; mask.T = T##_MASK; mask; }), \ - ({ union drbd_state val; val.i = 0; val.T = (S); val; }) -#define NS2(T1, S1, T2, S2) \ - ({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \ - mask.T2 = T2##_MASK; mask; }), \ - ({ union drbd_state val; val.i = 0; val.T1 = (S1); \ - val.T2 = (S2); val; }) -#define NS3(T1, S1, T2, S2, T3, S3) \ - ({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \ - mask.T2 = T2##_MASK; mask.T3 = T3##_MASK; mask; }), \ - ({ union drbd_state val; val.i = 0; val.T1 = (S1); \ - val.T2 = (S2); val.T3 = (S3); val; }) - -#define _NS(D, T, S) \ - D, ({ union drbd_state __ns; __ns = drbd_read_state(D); __ns.T = (S); __ns; }) -#define _NS2(D, T1, S1, T2, S2) \ - D, ({ union drbd_state __ns; __ns = drbd_read_state(D); __ns.T1 = (S1); \ - __ns.T2 = (S2); __ns; }) -#define _NS3(D, T1, S1, T2, S2, T3, S3) \ - D, ({ union drbd_state __ns; __ns = drbd_read_state(D); __ns.T1 = (S1); \ - __ns.T2 = (S2); __ns.T3 = (S3); __ns; }) - -enum chg_state_flags { - CS_HARD = 1 << 0, - CS_VERBOSE = 1 << 1, - CS_WAIT_COMPLETE = 1 << 2, - CS_SERIALIZE = 1 << 3, - CS_ORDERED = CS_WAIT_COMPLETE + CS_SERIALIZE, - CS_LOCAL_ONLY = 1 << 4, /* Do not consider a device pair wide state change */ - CS_DC_ROLE = 1 << 5, /* DC = display as connection state change */ - CS_DC_PEER = 1 << 6, - CS_DC_CONN = 1 << 7, - CS_DC_DISK = 1 << 8, - CS_DC_PDSK = 1 << 9, - CS_DC_SUSP = 1 << 10, - CS_DC_MASK = CS_DC_ROLE + CS_DC_PEER + CS_DC_CONN + CS_DC_DISK + CS_DC_PDSK, - CS_IGN_OUTD_FAIL = 1 << 11, -}; - -/* drbd_dev_state and drbd_state are different types. This is to stress the - small difference. There is no suspended flag (.susp), and no suspended - while fence handler runs flas (susp_fen). */ -union drbd_dev_state { - struct { -#if defined(__LITTLE_ENDIAN_BITFIELD) - unsigned role:2 ; /* 3/4 primary/secondary/unknown */ - unsigned peer:2 ; /* 3/4 primary/secondary/unknown */ - unsigned conn:5 ; /* 17/32 cstates */ - unsigned disk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */ - unsigned pdsk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */ - unsigned _unused:1 ; - unsigned aftr_isp:1 ; /* isp .. imposed sync pause */ - unsigned peer_isp:1 ; - unsigned user_isp:1 ; - unsigned _pad:11; /* 0 unused */ -#elif defined(__BIG_ENDIAN_BITFIELD) - unsigned _pad:11; - unsigned user_isp:1 ; - unsigned peer_isp:1 ; - unsigned aftr_isp:1 ; /* isp .. 
imposed sync pause */ - unsigned _unused:1 ; - unsigned pdsk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */ - unsigned disk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */ - unsigned conn:5 ; /* 17/32 cstates */ - unsigned peer:2 ; /* 3/4 primary/secondary/unknown */ - unsigned role:2 ; /* 3/4 primary/secondary/unknown */ -#else -# error "this endianess is not supported" -#endif - }; - unsigned int i; -}; - -extern enum drbd_state_rv drbd_change_state(struct drbd_conf *mdev, - enum chg_state_flags f, - union drbd_state mask, - union drbd_state val); -extern void drbd_force_state(struct drbd_conf *, union drbd_state, - union drbd_state); -extern enum drbd_state_rv _drbd_request_state(struct drbd_conf *, - union drbd_state, - union drbd_state, - enum chg_state_flags); -extern enum drbd_state_rv __drbd_set_state(struct drbd_conf *, union drbd_state, - enum chg_state_flags, - struct completion *done); -extern void print_st_err(struct drbd_conf *, union drbd_state, - union drbd_state, int); - -enum drbd_state_rv -_conn_request_state(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val, - enum chg_state_flags flags); - -enum drbd_state_rv -conn_request_state(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val, - enum chg_state_flags flags); - -extern void drbd_resume_al(struct drbd_conf *mdev); -extern bool conn_all_vols_unconf(struct drbd_tconn *tconn); - -/** - * drbd_request_state() - Reqest a state change - * @mdev: DRBD device. - * @mask: mask of state bits to change. - * @val: value of new state bits. - * - * This is the most graceful way of requesting a state change. It is verbose - * quite verbose in case the state change is not possible, and all those - * state changes are globally serialized. - */ -static inline int drbd_request_state(struct drbd_conf *mdev, - union drbd_state mask, - union drbd_state val) -{ - return _drbd_request_state(mdev, mask, val, CS_VERBOSE + CS_ORDERED); -} - -enum drbd_role conn_highest_role(struct drbd_tconn *tconn); -enum drbd_role conn_highest_peer(struct drbd_tconn *tconn); -enum drbd_disk_state conn_highest_disk(struct drbd_tconn *tconn); -enum drbd_disk_state conn_lowest_disk(struct drbd_tconn *tconn); -enum drbd_disk_state conn_highest_pdsk(struct drbd_tconn *tconn); -enum drbd_conns conn_lowest_conn(struct drbd_tconn *tconn); - -#endif diff --git a/trunk/drivers/block/drbd/drbd_strings.c b/trunk/drivers/block/drbd/drbd_strings.c index 9a664bd27404..c44a2a602772 100644 --- a/trunk/drivers/block/drbd/drbd_strings.c +++ b/trunk/drivers/block/drbd/drbd_strings.c @@ -89,7 +89,6 @@ static const char *drbd_state_sw_errors[] = { [-SS_LOWER_THAN_OUTDATED] = "Disk state is lower than outdated", [-SS_IN_TRANSIENT_STATE] = "In transient state, retry after next state change", [-SS_CONCURRENT_ST_CHG] = "Concurrent state changes detected and aborted", - [-SS_O_VOL_PEER_PRI] = "Other vol primary on peer not allowed by config", }; const char *drbd_conn_str(enum drbd_conns s) diff --git a/trunk/drivers/block/drbd/drbd_worker.c b/trunk/drivers/block/drbd/drbd_worker.c index 424dc7bdf9b7..6bce2cc179d4 100644 --- a/trunk/drivers/block/drbd/drbd_worker.c +++ b/trunk/drivers/block/drbd/drbd_worker.c @@ -38,13 +38,16 @@ #include "drbd_int.h" #include "drbd_req.h" -static int w_make_ov_request(struct drbd_work *w, int cancel); +static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel); +static int w_make_resync_request(struct drbd_conf *mdev, + struct drbd_work *w, int cancel); + /* endio handlers: * 
drbd_md_io_complete (defined here) - * drbd_request_endio (defined here) - * drbd_peer_request_endio (defined here) + * drbd_endio_pri (defined here) + * drbd_endio_sec (defined here) * bm_async_io_complete (defined in drbd_bitmap.c) * * For all these callbacks, note the following: @@ -57,7 +60,7 @@ static int w_make_ov_request(struct drbd_work *w, int cancel); /* About the global_state_lock Each state transition on an device holds a read lock. In case we have - to evaluate the resync after dependencies, we grab a write lock, because + to evaluate the sync after dependencies, we grab a write lock, because we need stable states on all devices for that. */ rwlock_t global_state_lock; @@ -95,93 +98,97 @@ void drbd_md_io_complete(struct bio *bio, int error) /* reads on behalf of the partner, * "submitted" by the receiver */ -void drbd_endio_read_sec_final(struct drbd_peer_request *peer_req) __releases(local) +void drbd_endio_read_sec_final(struct drbd_epoch_entry *e) __releases(local) { unsigned long flags = 0; - struct drbd_conf *mdev = peer_req->w.mdev; + struct drbd_conf *mdev = e->mdev; + + D_ASSERT(e->block_id != ID_VACANT); - spin_lock_irqsave(&mdev->tconn->req_lock, flags); - mdev->read_cnt += peer_req->i.size >> 9; - list_del(&peer_req->w.list); + spin_lock_irqsave(&mdev->req_lock, flags); + mdev->read_cnt += e->size >> 9; + list_del(&e->w.list); if (list_empty(&mdev->read_ee)) wake_up(&mdev->ee_wait); - if (test_bit(__EE_WAS_ERROR, &peer_req->flags)) - __drbd_chk_io_error(mdev, DRBD_READ_ERROR); - spin_unlock_irqrestore(&mdev->tconn->req_lock, flags); + if (test_bit(__EE_WAS_ERROR, &e->flags)) + __drbd_chk_io_error(mdev, DRBD_IO_ERROR); + spin_unlock_irqrestore(&mdev->req_lock, flags); - drbd_queue_work(&mdev->tconn->sender_work, &peer_req->w); + drbd_queue_work(&mdev->data.work, &e->w); put_ldev(mdev); } /* writes on behalf of the partner, or resync writes, * "submitted" by the receiver, final stage. */ -static void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(local) +static void drbd_endio_write_sec_final(struct drbd_epoch_entry *e) __releases(local) { unsigned long flags = 0; - struct drbd_conf *mdev = peer_req->w.mdev; - struct drbd_interval i; + struct drbd_conf *mdev = e->mdev; + sector_t e_sector; int do_wake; - u64 block_id; + int is_syncer_req; int do_al_complete_io; - /* after we moved peer_req to done_ee, + D_ASSERT(e->block_id != ID_VACANT); + + /* after we moved e to done_ee, * we may no longer access it, * it may be freed/reused already! * (as soon as we release the req_lock) */ - i = peer_req->i; - do_al_complete_io = peer_req->flags & EE_CALL_AL_COMPLETE_IO; - block_id = peer_req->block_id; + e_sector = e->sector; + do_al_complete_io = e->flags & EE_CALL_AL_COMPLETE_IO; + is_syncer_req = is_syncer_block_id(e->block_id); - spin_lock_irqsave(&mdev->tconn->req_lock, flags); - mdev->writ_cnt += peer_req->i.size >> 9; - list_move_tail(&peer_req->w.list, &mdev->done_ee); + spin_lock_irqsave(&mdev->req_lock, flags); + mdev->writ_cnt += e->size >> 9; + list_del(&e->w.list); /* has been on active_ee or sync_ee */ + list_add_tail(&e->w.list, &mdev->done_ee); - /* - * Do not remove from the write_requests tree here: we did not send the - * Ack yet and did not wake possibly waiting conflicting requests. - * Removed from the tree from "drbd_process_done_ee" within the - * appropriate w.cb (e_end_block/e_end_resync_block) or from - * _drbd_clear_done_ee. 
- */ + /* No hlist_del_init(&e->collision) here, we did not send the Ack yet, + * neither did we wake possibly waiting conflicting requests. + * done from "drbd_process_done_ee" within the appropriate w.cb + * (e_end_block/e_end_resync_block) or from _drbd_clear_done_ee */ - do_wake = list_empty(block_id == ID_SYNCER ? &mdev->sync_ee : &mdev->active_ee); + do_wake = is_syncer_req + ? list_empty(&mdev->sync_ee) + : list_empty(&mdev->active_ee); - if (test_bit(__EE_WAS_ERROR, &peer_req->flags)) - __drbd_chk_io_error(mdev, DRBD_WRITE_ERROR); - spin_unlock_irqrestore(&mdev->tconn->req_lock, flags); + if (test_bit(__EE_WAS_ERROR, &e->flags)) + __drbd_chk_io_error(mdev, DRBD_IO_ERROR); + spin_unlock_irqrestore(&mdev->req_lock, flags); - if (block_id == ID_SYNCER) - drbd_rs_complete_io(mdev, i.sector); + if (is_syncer_req) + drbd_rs_complete_io(mdev, e_sector); if (do_wake) wake_up(&mdev->ee_wait); if (do_al_complete_io) - drbd_al_complete_io(mdev, &i); + drbd_al_complete_io(mdev, e_sector); - wake_asender(mdev->tconn); + wake_asender(mdev); put_ldev(mdev); } /* writes on behalf of the partner, or resync writes, * "submitted" by the receiver. */ -void drbd_peer_request_endio(struct bio *bio, int error) +void drbd_endio_sec(struct bio *bio, int error) { - struct drbd_peer_request *peer_req = bio->bi_private; - struct drbd_conf *mdev = peer_req->w.mdev; + struct drbd_epoch_entry *e = bio->bi_private; + struct drbd_conf *mdev = e->mdev; int uptodate = bio_flagged(bio, BIO_UPTODATE); int is_write = bio_data_dir(bio) == WRITE; if (error && __ratelimit(&drbd_ratelimit_state)) dev_warn(DEV, "%s: error=%d s=%llus\n", is_write ? "write" : "read", error, - (unsigned long long)peer_req->i.sector); + (unsigned long long)e->sector); if (!error && !uptodate) { if (__ratelimit(&drbd_ratelimit_state)) dev_warn(DEV, "%s: setting error to -EIO s=%llus\n", is_write ? "write" : "read", - (unsigned long long)peer_req->i.sector); + (unsigned long long)e->sector); /* strange behavior of some lower level drivers... * fail the request by clearing the uptodate flag, * but do not return any error?! */ @@ -189,24 +196,24 @@ void drbd_peer_request_endio(struct bio *bio, int error) } if (error) - set_bit(__EE_WAS_ERROR, &peer_req->flags); + set_bit(__EE_WAS_ERROR, &e->flags); bio_put(bio); /* no need for the bio anymore */ - if (atomic_dec_and_test(&peer_req->pending_bios)) { + if (atomic_dec_and_test(&e->pending_bios)) { if (is_write) - drbd_endio_write_sec_final(peer_req); + drbd_endio_write_sec_final(e); else - drbd_endio_read_sec_final(peer_req); + drbd_endio_read_sec_final(e); } } /* read, readA or write requests on R_PRIMARY coming from drbd_make_request */ -void drbd_request_endio(struct bio *bio, int error) +void drbd_endio_pri(struct bio *bio, int error) { unsigned long flags; struct drbd_request *req = bio->bi_private; - struct drbd_conf *mdev = req->w.mdev; + struct drbd_conf *mdev = req->mdev; struct bio_and_error m; enum drbd_req_event what; int uptodate = bio_flagged(bio, BIO_UPTODATE); @@ -220,72 +227,53 @@ void drbd_request_endio(struct bio *bio, int error) error = -EIO; } - - /* If this request was aborted locally before, - * but now was completed "successfully", - * chances are that this caused arbitrary data corruption. - * - * "aborting" requests, or force-detaching the disk, is intended for - * completely blocked/hung local backing devices which do no longer - * complete requests at all, not even do error completions. In this - * situation, usually a hard-reset and failover is the only way out. 
- * - * By "aborting", basically faking a local error-completion, - * we allow for a more graceful swichover by cleanly migrating services. - * Still the affected node has to be rebooted "soon". - * - * By completing these requests, we allow the upper layers to re-use - * the associated data pages. - * - * If later the local backing device "recovers", and now DMAs some data - * from disk into the original request pages, in the best case it will - * just put random data into unused pages; but typically it will corrupt - * meanwhile completely unrelated data, causing all sorts of damage. - * - * Which means delayed successful completion, - * especially for READ requests, - * is a reason to panic(). - * - * We assume that a delayed *error* completion is OK, - * though we still will complain noisily about it. - */ - if (unlikely(req->rq_state & RQ_LOCAL_ABORTED)) { - if (__ratelimit(&drbd_ratelimit_state)) - dev_emerg(DEV, "delayed completion of aborted local request; disk-timeout may be too aggressive\n"); - - if (!error) - panic("possible random memory corruption caused by delayed completion of aborted local request\n"); - } - /* to avoid recursion in __req_mod */ if (unlikely(error)) { what = (bio_data_dir(bio) == WRITE) - ? WRITE_COMPLETED_WITH_ERROR + ? write_completed_with_error : (bio_rw(bio) == READ) - ? READ_COMPLETED_WITH_ERROR - : READ_AHEAD_COMPLETED_WITH_ERROR; + ? read_completed_with_error + : read_ahead_completed_with_error; } else - what = COMPLETED_OK; + what = completed_ok; bio_put(req->private_bio); req->private_bio = ERR_PTR(error); /* not req_mod(), we need irqsave here! */ - spin_lock_irqsave(&mdev->tconn->req_lock, flags); + spin_lock_irqsave(&mdev->req_lock, flags); __req_mod(req, what, &m); - spin_unlock_irqrestore(&mdev->tconn->req_lock, flags); + spin_unlock_irqrestore(&mdev->req_lock, flags); put_ldev(mdev); if (m.bio) complete_master_bio(mdev, &m); } -void drbd_csum_ee(struct drbd_conf *mdev, struct crypto_hash *tfm, - struct drbd_peer_request *peer_req, void *digest) +int w_read_retry_remote(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +{ + struct drbd_request *req = container_of(w, struct drbd_request, w); + + /* We should not detach for read io-error, + * but try to WRITE the P_DATA_REPLY to the failed location, + * to give the disk the chance to relocate that block */ + + spin_lock_irq(&mdev->req_lock); + if (cancel || mdev->state.pdsk != D_UP_TO_DATE) { + _req_mod(req, read_retry_remote_canceled); + spin_unlock_irq(&mdev->req_lock); + return 1; + } + spin_unlock_irq(&mdev->req_lock); + + return w_send_read_req(mdev, w, 0); +} + +void drbd_csum_ee(struct drbd_conf *mdev, struct crypto_hash *tfm, struct drbd_epoch_entry *e, void *digest) { struct hash_desc desc; struct scatterlist sg; - struct page *page = peer_req->pages; + struct page *page = e->pages; struct page *tmp; unsigned len; @@ -302,7 +290,7 @@ void drbd_csum_ee(struct drbd_conf *mdev, struct crypto_hash *tfm, page = tmp; } /* and now the last, possibly only partially used page */ - len = peer_req->i.size & (PAGE_SIZE - 1); + len = e->size & (PAGE_SIZE - 1); sg_set_page(&sg, page, len ?: PAGE_SIZE, 0); crypto_hash_update(&desc, &sg, sg.length); crypto_hash_final(&desc, digest); @@ -328,58 +316,59 @@ void drbd_csum_bio(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio * crypto_hash_final(&desc, digest); } -/* MAYBE merge common code with w_e_end_ov_req */ -static int w_e_send_csum(struct drbd_work *w, int cancel) +/* TODO merge common code with w_e_end_ov_req */ +int 
w_e_send_csum(struct drbd_conf *mdev, struct drbd_work *w, int cancel) { - struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w); - struct drbd_conf *mdev = w->mdev; + struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); int digest_size; void *digest; - int err = 0; + int ok = 1; + + D_ASSERT(e->block_id == DRBD_MAGIC + 0xbeef); if (unlikely(cancel)) goto out; - if (unlikely((peer_req->flags & EE_WAS_ERROR) != 0)) + if (likely((e->flags & EE_WAS_ERROR) != 0)) goto out; - digest_size = crypto_hash_digestsize(mdev->tconn->csums_tfm); + digest_size = crypto_hash_digestsize(mdev->csums_tfm); digest = kmalloc(digest_size, GFP_NOIO); if (digest) { - sector_t sector = peer_req->i.sector; - unsigned int size = peer_req->i.size; - drbd_csum_ee(mdev, mdev->tconn->csums_tfm, peer_req, digest); - /* Free peer_req and pages before send. + sector_t sector = e->sector; + unsigned int size = e->size; + drbd_csum_ee(mdev, mdev->csums_tfm, e, digest); + /* Free e and pages before send. * In case we block on congestion, we could otherwise run into * some distributed deadlock, if the other side blocks on * congestion as well, because our receiver blocks in - * drbd_alloc_pages due to pp_in_use > max_buffers. */ - drbd_free_peer_req(mdev, peer_req); - peer_req = NULL; + * drbd_pp_alloc due to pp_in_use > max_buffers. */ + drbd_free_ee(mdev, e); + e = NULL; inc_rs_pending(mdev); - err = drbd_send_drequest_csum(mdev, sector, size, - digest, digest_size, - P_CSUM_RS_REQUEST); + ok = drbd_send_drequest_csum(mdev, sector, size, + digest, digest_size, + P_CSUM_RS_REQUEST); kfree(digest); } else { dev_err(DEV, "kmalloc() of digest failed.\n"); - err = -ENOMEM; + ok = 0; } out: - if (peer_req) - drbd_free_peer_req(mdev, peer_req); + if (e) + drbd_free_ee(mdev, e); - if (unlikely(err)) + if (unlikely(!ok)) dev_err(DEV, "drbd_send_drequest(..., csum) failed\n"); - return err; + return ok; } #define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN) static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size) { - struct drbd_peer_request *peer_req; + struct drbd_epoch_entry *e; if (!get_ldev(mdev)) return -EIO; @@ -389,47 +378,45 @@ static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size) /* GFP_TRY, because if there is no memory available right now, this may * be rescheduled for later. It is "only" background resync, after all. */ - peer_req = drbd_alloc_peer_req(mdev, ID_SYNCER /* unused */, sector, - size, GFP_TRY); - if (!peer_req) + e = drbd_alloc_ee(mdev, DRBD_MAGIC+0xbeef, sector, size, GFP_TRY); + if (!e) goto defer; - peer_req->w.cb = w_e_send_csum; - spin_lock_irq(&mdev->tconn->req_lock); - list_add(&peer_req->w.list, &mdev->read_ee); - spin_unlock_irq(&mdev->tconn->req_lock); + e->w.cb = w_e_send_csum; + spin_lock_irq(&mdev->req_lock); + list_add(&e->w.list, &mdev->read_ee); + spin_unlock_irq(&mdev->req_lock); atomic_add(size >> 9, &mdev->rs_sect_ev); - if (drbd_submit_peer_request(mdev, peer_req, READ, DRBD_FAULT_RS_RD) == 0) + if (drbd_submit_ee(mdev, e, READ, DRBD_FAULT_RS_RD) == 0) return 0; /* If it failed because of ENOMEM, retry should help. If it failed * because bio_add_page failed (probably broken lower level driver), * retry may or may not help. * If it does not, you may need to force disconnect. 
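drbd_csum_ee() earlier in this hunk hashes a chained list of pages in which only the last page may be partially used. A stand-alone sketch of that traversal, with a trivial additive checksum standing in for the crypto_hash calls; the chunk size and data are made up.

#include <stdio.h>
#include <stddef.h>

#define CHUNK_SIZE 8		/* stands in for PAGE_SIZE */

struct chunk {
	struct chunk *next;
	unsigned char data[CHUNK_SIZE];
};

static unsigned int csum_chain(struct chunk *head, size_t total_len)
{
	unsigned int sum = 0;
	struct chunk *c = head;
	size_t len;

	/* all chunks but the last are fully used */
	while (c->next) {
		for (size_t i = 0; i < CHUNK_SIZE; i++)
			sum += c->data[i];
		total_len -= CHUNK_SIZE;
		c = c->next;
	}
	/* the last chunk may be only partially used */
	len = total_len % CHUNK_SIZE;
	if (len == 0)
		len = CHUNK_SIZE;	/* same effect as "len ?: PAGE_SIZE" */
	for (size_t i = 0; i < len; i++)
		sum += c->data[i];
	return sum;
}

int main(void)
{
	struct chunk b = { .next = NULL, .data = { 4, 5, 6 } };
	struct chunk a = { .next = &b,   .data = { 1, 2, 3, 1, 1, 1, 1, 1 } };

	/* 11 bytes of payload spread over two chunks */
	printf("checksum: %u\n", csum_chain(&a, 11));
	return 0;
}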
*/ - spin_lock_irq(&mdev->tconn->req_lock); - list_del(&peer_req->w.list); - spin_unlock_irq(&mdev->tconn->req_lock); + spin_lock_irq(&mdev->req_lock); + list_del(&e->w.list); + spin_unlock_irq(&mdev->req_lock); - drbd_free_peer_req(mdev, peer_req); + drbd_free_ee(mdev, e); defer: put_ldev(mdev); return -EAGAIN; } -int w_resync_timer(struct drbd_work *w, int cancel) +int w_resync_timer(struct drbd_conf *mdev, struct drbd_work *w, int cancel) { - struct drbd_conf *mdev = w->mdev; switch (mdev->state.conn) { case C_VERIFY_S: - w_make_ov_request(w, cancel); + w_make_ov_request(mdev, w, cancel); break; case C_SYNC_TARGET: - w_make_resync_request(w, cancel); + w_make_resync_request(mdev, w, cancel); break; } - return 0; + return 1; } void resync_timer_fn(unsigned long data) @@ -437,7 +424,7 @@ void resync_timer_fn(unsigned long data) struct drbd_conf *mdev = (struct drbd_conf *) data; if (list_empty(&mdev->resync_work.list)) - drbd_queue_work(&mdev->tconn->sender_work, &mdev->resync_work); + drbd_queue_work(&mdev->data.work, &mdev->resync_work); } static void fifo_set(struct fifo_buffer *fb, int value) @@ -469,24 +456,8 @@ static void fifo_add_val(struct fifo_buffer *fb, int value) fb->values[i] += value; } -struct fifo_buffer *fifo_alloc(int fifo_size) -{ - struct fifo_buffer *fb; - - fb = kzalloc(sizeof(struct fifo_buffer) + sizeof(int) * fifo_size, GFP_NOIO); - if (!fb) - return NULL; - - fb->head_index = 0; - fb->size = fifo_size; - fb->total = 0; - - return fb; -} - static int drbd_rs_controller(struct drbd_conf *mdev) { - struct disk_conf *dc; unsigned int sect_in; /* Number of sectors that came in since the last turn */ unsigned int want; /* The number of sectors we want in the proxy */ int req_sect; /* Number of sectors to request in this turn */ @@ -495,39 +466,38 @@ static int drbd_rs_controller(struct drbd_conf *mdev) int steps; /* Number of time steps to plan ahead */ int curr_corr; int max_sect; - struct fifo_buffer *plan; sect_in = atomic_xchg(&mdev->rs_sect_in, 0); /* Number of sectors that came in */ mdev->rs_in_flight -= sect_in; - dc = rcu_dereference(mdev->ldev->disk_conf); - plan = rcu_dereference(mdev->rs_plan_s); + spin_lock(&mdev->peer_seq_lock); /* get an atomic view on mdev->rs_plan_s */ - steps = plan->size; /* (dc->c_plan_ahead * 10 * SLEEP_TIME) / HZ; */ + steps = mdev->rs_plan_s.size; /* (mdev->sync_conf.c_plan_ahead * 10 * SLEEP_TIME) / HZ; */ if (mdev->rs_in_flight + sect_in == 0) { /* At start of resync */ - want = ((dc->resync_rate * 2 * SLEEP_TIME) / HZ) * steps; + want = ((mdev->sync_conf.rate * 2 * SLEEP_TIME) / HZ) * steps; } else { /* normal path */ - want = dc->c_fill_target ? dc->c_fill_target : - sect_in * dc->c_delay_target * HZ / (SLEEP_TIME * 10); + want = mdev->sync_conf.c_fill_target ? 
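Illustrative aside: fifo_set() and fifo_add_val() are visible above, but fifo_push(), which drbd_rs_controller() calls below, is defined elsewhere in the driver. A minimal sketch consistent with how the controller uses it follows; the field names and exact semantics are assumptions, not the patched source.

struct fifo_buffer {
	unsigned int head_index;
	unsigned int size;
	int values[];		/* allocated with 'size' extra ints */
};

/* Return the correction previously planned for this slot and store the
 * new value in its place, advancing the ring by one planning step. */
static int fifo_push(struct fifo_buffer *fb, int value)
{
	int ov = fb->values[fb->head_index];

	fb->values[fb->head_index++] = value;
	if (fb->head_index >= fb->size)
		fb->head_index = 0;
	return ov;
}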
mdev->sync_conf.c_fill_target : + sect_in * mdev->sync_conf.c_delay_target * HZ / (SLEEP_TIME * 10); } - correction = want - mdev->rs_in_flight - plan->total; + correction = want - mdev->rs_in_flight - mdev->rs_planed; /* Plan ahead */ cps = correction / steps; - fifo_add_val(plan, cps); - plan->total += cps * steps; + fifo_add_val(&mdev->rs_plan_s, cps); + mdev->rs_planed += cps * steps; /* What we do in this step */ - curr_corr = fifo_push(plan, 0); - plan->total -= curr_corr; + curr_corr = fifo_push(&mdev->rs_plan_s, 0); + spin_unlock(&mdev->peer_seq_lock); + mdev->rs_planed -= curr_corr; req_sect = sect_in + curr_corr; if (req_sect < 0) req_sect = 0; - max_sect = (dc->c_max_rate * 2 * SLEEP_TIME) / HZ; + max_sect = (mdev->sync_conf.c_max_rate * 2 * SLEEP_TIME) / HZ; if (req_sect > max_sect) req_sect = max_sect; @@ -543,25 +513,22 @@ static int drbd_rs_controller(struct drbd_conf *mdev) static int drbd_rs_number_requests(struct drbd_conf *mdev) { int number; - - rcu_read_lock(); - if (rcu_dereference(mdev->rs_plan_s)->size) { + if (mdev->rs_plan_s.size) { /* mdev->sync_conf.c_plan_ahead */ number = drbd_rs_controller(mdev) >> (BM_BLOCK_SHIFT - 9); mdev->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME; } else { - mdev->c_sync_rate = rcu_dereference(mdev->ldev->disk_conf)->resync_rate; + mdev->c_sync_rate = mdev->sync_conf.rate; number = SLEEP_TIME * mdev->c_sync_rate / ((BM_BLOCK_SIZE / 1024) * HZ); } - rcu_read_unlock(); /* ignore the amount of pending requests, the resync controller should * throttle down to incoming reply rate soon enough anyways. */ return number; } -int w_make_resync_request(struct drbd_work *w, int cancel) +static int w_make_resync_request(struct drbd_conf *mdev, + struct drbd_work *w, int cancel) { - struct drbd_conf *mdev = w->mdev; unsigned long bit; sector_t sector; const sector_t capacity = drbd_get_capacity(mdev->this_bdev); @@ -571,12 +538,12 @@ int w_make_resync_request(struct drbd_work *w, int cancel) int i = 0; if (unlikely(cancel)) - return 0; + return 1; if (mdev->rs_total == 0) { /* empty resync? 
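Illustrative aside: condensed, the resync controller above reduces to the per-step arithmetic below. SLEEP_TIME = HZ/10 (a 100 ms planning step) and BM_BLOCK_SIZE = 4096 are the customary values in this driver and are assumed here rather than taken from the hunk.

/*
 *   steps  = c_plan_ahead * 10 * SLEEP_TIME / HZ          plan-ahead slots
 *   want   = c_fill_target                                if a fill target is set
 *          = sect_in * c_delay_target * HZ / (SLEEP_TIME * 10)   otherwise
 *            (at the very start of a resync, seeded from the configured rate)
 *   corr   = want - rs_in_flight - already_planned
 *   cps    = corr / steps        spread the correction over the whole plan
 *   req    = sect_in + fifo_push(plan, 0)    sectors to ask for in this step,
 *            capped at c_max_rate * 2 * SLEEP_TIME / HZ
 *
 * drbd_rs_number_requests() then turns sectors into resync requests:
 *
 *   number = req >> (BM_BLOCK_SHIFT - 9)     512-byte sectors to 4 KiB blocks
 */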
*/ drbd_resync_finished(mdev); - return 0; + return 1; } if (!get_ldev(mdev)) { @@ -585,7 +552,7 @@ int w_make_resync_request(struct drbd_work *w, int cancel) to continue resync with a broken disk makes no sense at all */ dev_err(DEV, "Disk broke down during resync!\n"); - return 0; + return 1; } max_bio_size = queue_max_hw_sectors(mdev->rq_queue) << 9; @@ -595,15 +562,15 @@ int w_make_resync_request(struct drbd_work *w, int cancel) for (i = 0; i < number; i++) { /* Stop generating RS requests, when half of the send buffer is filled */ - mutex_lock(&mdev->tconn->data.mutex); - if (mdev->tconn->data.socket) { - queued = mdev->tconn->data.socket->sk->sk_wmem_queued; - sndbuf = mdev->tconn->data.socket->sk->sk_sndbuf; + mutex_lock(&mdev->data.mutex); + if (mdev->data.socket) { + queued = mdev->data.socket->sk->sk_wmem_queued; + sndbuf = mdev->data.socket->sk->sk_sndbuf; } else { queued = 1; sndbuf = 0; } - mutex_unlock(&mdev->tconn->data.mutex); + mutex_unlock(&mdev->data.mutex); if (queued > sndbuf / 2) goto requeue; @@ -614,7 +581,7 @@ int w_make_resync_request(struct drbd_work *w, int cancel) if (bit == DRBD_END_OF_BITMAP) { mdev->bm_resync_fo = drbd_bm_bits(mdev); put_ldev(mdev); - return 0; + return 1; } sector = BM_BIT_TO_SECT(bit); @@ -673,11 +640,11 @@ int w_make_resync_request(struct drbd_work *w, int cancel) /* adjust very last sectors, in case we are oddly sized */ if (sector + (size>>9) > capacity) size = (capacity-sector)<<9; - if (mdev->tconn->agreed_pro_version >= 89 && mdev->tconn->csums_tfm) { + if (mdev->agreed_pro_version >= 89 && mdev->csums_tfm) { switch (read_for_csum(mdev, sector, size)) { case -EIO: /* Disk failure */ put_ldev(mdev); - return -EIO; + return 0; case -EAGAIN: /* allocation failed, or ldev busy */ drbd_rs_complete_io(mdev, sector); mdev->bm_resync_fo = BM_SECT_TO_BIT(sector); @@ -690,16 +657,13 @@ int w_make_resync_request(struct drbd_work *w, int cancel) BUG(); } } else { - int err; - inc_rs_pending(mdev); - err = drbd_send_drequest(mdev, P_RS_DATA_REQUEST, - sector, size, ID_SYNCER); - if (err) { + if (!drbd_send_drequest(mdev, P_RS_DATA_REQUEST, + sector, size, ID_SYNCER)) { dev_err(DEV, "drbd_send_drequest() failed, aborting...\n"); dec_rs_pending(mdev); put_ldev(mdev); - return err; + return 0; } } } @@ -712,23 +676,21 @@ int w_make_resync_request(struct drbd_work *w, int cancel) * until then resync "work" is "inactive" ... */ put_ldev(mdev); - return 0; + return 1; } requeue: mdev->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9)); mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME); put_ldev(mdev); - return 0; + return 1; } -static int w_make_ov_request(struct drbd_work *w, int cancel) +static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel) { - struct drbd_conf *mdev = w->mdev; int number, i, size; sector_t sector; const sector_t capacity = drbd_get_capacity(mdev->this_bdev); - bool stop_sector_reached = false; if (unlikely(cancel)) return 1; @@ -737,17 +699,9 @@ static int w_make_ov_request(struct drbd_work *w, int cancel) sector = mdev->ov_position; for (i = 0; i < number; i++) { - if (sector >= capacity) + if (sector >= capacity) { return 1; - - /* We check for "finished" only in the reply path: - * w_e_end_ov_reply(). - * We need to send at least one request out. 
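Illustrative aside: the bitmap/sector conversions used by the request loop above are plain shifts once the bitmap granularity is fixed; the 4 KiB block size is the usual value and an assumption here.

/*
 * With BM_BLOCK_SIZE = 4096 (BM_BLOCK_SHIFT = 12) one bitmap bit covers
 * eight 512-byte sectors, so
 *
 *   BM_BIT_TO_SECT(bit)    == (sector_t)bit << 3
 *   BM_SECT_TO_BIT(sector) == sector >> 3
 *
 * and mdev->bm_resync_fo is simply the resync cursor expressed in bits.
 */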
*/ - stop_sector_reached = i > 0 - && verify_can_do_stop_sector(mdev) - && sector >= mdev->ov_stop_sector; - if (stop_sector_reached) - break; + } size = BM_BLOCK_SIZE; @@ -761,7 +715,7 @@ static int w_make_ov_request(struct drbd_work *w, int cancel) size = (capacity-sector)<<9; inc_rs_pending(mdev); - if (drbd_send_ov_request(mdev, sector, size)) { + if (!drbd_send_ov_request(mdev, sector, size)) { dec_rs_pending(mdev); return 0; } @@ -771,39 +725,56 @@ static int w_make_ov_request(struct drbd_work *w, int cancel) requeue: mdev->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9)); - if (i == 0 || !stop_sector_reached) - mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME); + mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME); + return 1; +} + + +void start_resync_timer_fn(unsigned long data) +{ + struct drbd_conf *mdev = (struct drbd_conf *) data; + + drbd_queue_work(&mdev->data.work, &mdev->start_resync_work); +} + +int w_start_resync(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +{ + if (atomic_read(&mdev->unacked_cnt) || atomic_read(&mdev->rs_pending_cnt)) { + dev_warn(DEV, "w_start_resync later...\n"); + mdev->start_resync_timer.expires = jiffies + HZ/10; + add_timer(&mdev->start_resync_timer); + return 1; + } + + drbd_start_resync(mdev, C_SYNC_SOURCE); + clear_bit(AHEAD_TO_SYNC_SOURCE, &mdev->flags); return 1; } -int w_ov_finished(struct drbd_work *w, int cancel) +int w_ov_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel) { - struct drbd_conf *mdev = w->mdev; kfree(w); - ov_out_of_sync_print(mdev); + ov_oos_print(mdev); drbd_resync_finished(mdev); - return 0; + return 1; } -static int w_resync_finished(struct drbd_work *w, int cancel) +static int w_resync_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel) { - struct drbd_conf *mdev = w->mdev; kfree(w); drbd_resync_finished(mdev); - return 0; + return 1; } static void ping_peer(struct drbd_conf *mdev) { - struct drbd_tconn *tconn = mdev->tconn; - - clear_bit(GOT_PING_ACK, &tconn->flags); - request_ping(tconn); - wait_event(tconn->ping_wait, - test_bit(GOT_PING_ACK, &tconn->flags) || mdev->state.conn < C_CONNECTED); + clear_bit(GOT_PING_ACK, &mdev->flags); + request_ping(mdev); + wait_event(mdev->misc_wait, + test_bit(GOT_PING_ACK, &mdev->flags) || mdev->state.conn < C_CONNECTED); } int drbd_resync_finished(struct drbd_conf *mdev) @@ -828,8 +799,7 @@ int drbd_resync_finished(struct drbd_conf *mdev) w = kmalloc(sizeof(struct drbd_work), GFP_ATOMIC); if (w) { w->cb = w_resync_finished; - w->mdev = mdev; - drbd_queue_work(&mdev->tconn->sender_work, w); + drbd_queue_work(&mdev->data.work, w); return 1; } dev_err(DEV, "Warn failed to drbd_rs_del_all() and to kmalloc(w).\n"); @@ -838,12 +808,7 @@ int drbd_resync_finished(struct drbd_conf *mdev) dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ; if (dt <= 0) dt = 1; - db = mdev->rs_total; - /* adjust for verify start and stop sectors, respective reached position */ - if (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T) - db -= mdev->ov_left; - dbdt = Bit2KB(db/dt); mdev->rs_paused /= HZ; @@ -852,8 +817,8 @@ int drbd_resync_finished(struct drbd_conf *mdev) ping_peer(mdev); - spin_lock_irq(&mdev->tconn->req_lock); - os = drbd_read_state(mdev); + spin_lock_irq(&mdev->req_lock); + os = mdev->state; verify_done = (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T); @@ -866,7 +831,7 @@ int drbd_resync_finished(struct drbd_conf *mdev) ns.conn = C_CONNECTED; dev_info(DEV, "%s done (total %lu sec; paused %lu sec; %lu K/sec)\n", - verify_done ? 
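Illustrative aside: the "%lu K/sec" figure printed by drbd_resync_finished() above comes from the dt/db/dbdt values computed a few lines earlier; the 4 KiB-per-bit factor is an assumption about the bitmap granularity, not stated in this hunk.

/*
 *   dt   = elapsed seconds, excluding paused time (clamped to at least 1)
 *   db   = bitmap bits handled; the newer code being reverted here also
 *          subtracts ov_left for online verify
 *   dbdt = Bit2KB(db / dt) == (db / dt) << 2      KiB per second
 *
 * Example: 1048576 bits (4 GiB of data) in 100 s gives 10485 bits/s,
 * reported as 41940 K/sec.
 */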
"Online verify" : "Resync", + verify_done ? "Online verify " : "Resync", dt + mdev->rs_paused, mdev->rs_paused, dbdt); n_oos = drbd_bm_total_weight(mdev); @@ -883,7 +848,7 @@ int drbd_resync_finished(struct drbd_conf *mdev) if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) khelper_cmd = "after-resync-target"; - if (mdev->tconn->csums_tfm && mdev->rs_total) { + if (mdev->csums_tfm && mdev->rs_total) { const unsigned long s = mdev->rs_same_csum; const unsigned long t = mdev->rs_total; const int ratio = @@ -941,15 +906,13 @@ int drbd_resync_finished(struct drbd_conf *mdev) _drbd_set_state(mdev, ns, CS_VERBOSE, NULL); out_unlock: - spin_unlock_irq(&mdev->tconn->req_lock); + spin_unlock_irq(&mdev->req_lock); put_ldev(mdev); out: mdev->rs_total = 0; mdev->rs_failed = 0; mdev->rs_paused = 0; - - /* reset start sector, if we reached end of device */ - if (verify_done && mdev->ov_left == 0) + if (verify_done) mdev->ov_start_sector = 0; drbd_md_sync(mdev); @@ -961,19 +924,19 @@ int drbd_resync_finished(struct drbd_conf *mdev) } /* helper */ -static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_peer_request *peer_req) +static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_epoch_entry *e) { - if (drbd_peer_req_has_active_page(peer_req)) { + if (drbd_ee_has_active_page(e)) { /* This might happen if sendpage() has not finished */ - int i = (peer_req->i.size + PAGE_SIZE -1) >> PAGE_SHIFT; + int i = (e->size + PAGE_SIZE -1) >> PAGE_SHIFT; atomic_add(i, &mdev->pp_in_use_by_net); atomic_sub(i, &mdev->pp_in_use); - spin_lock_irq(&mdev->tconn->req_lock); - list_add_tail(&peer_req->w.list, &mdev->net_ee); - spin_unlock_irq(&mdev->tconn->req_lock); + spin_lock_irq(&mdev->req_lock); + list_add_tail(&e->w.list, &mdev->net_ee); + spin_unlock_irq(&mdev->req_lock); wake_up(&drbd_pp_wait); } else - drbd_free_peer_req(mdev, peer_req); + drbd_free_ee(mdev, e); } /** @@ -982,177 +945,174 @@ static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_peer_requ * @w: work object. * @cancel: The connection will be closed anyways */ -int w_e_end_data_req(struct drbd_work *w, int cancel) +int w_e_end_data_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) { - struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w); - struct drbd_conf *mdev = w->mdev; - int err; + struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); + int ok; if (unlikely(cancel)) { - drbd_free_peer_req(mdev, peer_req); + drbd_free_ee(mdev, e); dec_unacked(mdev); - return 0; + return 1; } - if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { - err = drbd_send_block(mdev, P_DATA_REPLY, peer_req); + if (likely((e->flags & EE_WAS_ERROR) == 0)) { + ok = drbd_send_block(mdev, P_DATA_REPLY, e); } else { if (__ratelimit(&drbd_ratelimit_state)) dev_err(DEV, "Sending NegDReply. sector=%llus.\n", - (unsigned long long)peer_req->i.sector); + (unsigned long long)e->sector); - err = drbd_send_ack(mdev, P_NEG_DREPLY, peer_req); + ok = drbd_send_ack(mdev, P_NEG_DREPLY, e); } dec_unacked(mdev); - move_to_net_ee_or_free(mdev, peer_req); + move_to_net_ee_or_free(mdev, e); - if (unlikely(err)) + if (unlikely(!ok)) dev_err(DEV, "drbd_send_block() failed\n"); - return err; + return ok; } /** - * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUEST + * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUESTRS * @mdev: DRBD device. * @w: work object. 
* @cancel: The connection will be closed anyways */ -int w_e_end_rsdata_req(struct drbd_work *w, int cancel) +int w_e_end_rsdata_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) { - struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w); - struct drbd_conf *mdev = w->mdev; - int err; + struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); + int ok; if (unlikely(cancel)) { - drbd_free_peer_req(mdev, peer_req); + drbd_free_ee(mdev, e); dec_unacked(mdev); - return 0; + return 1; } if (get_ldev_if_state(mdev, D_FAILED)) { - drbd_rs_complete_io(mdev, peer_req->i.sector); + drbd_rs_complete_io(mdev, e->sector); put_ldev(mdev); } if (mdev->state.conn == C_AHEAD) { - err = drbd_send_ack(mdev, P_RS_CANCEL, peer_req); - } else if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { + ok = drbd_send_ack(mdev, P_RS_CANCEL, e); + } else if (likely((e->flags & EE_WAS_ERROR) == 0)) { if (likely(mdev->state.pdsk >= D_INCONSISTENT)) { inc_rs_pending(mdev); - err = drbd_send_block(mdev, P_RS_DATA_REPLY, peer_req); + ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e); } else { if (__ratelimit(&drbd_ratelimit_state)) dev_err(DEV, "Not sending RSDataReply, " "partner DISKLESS!\n"); - err = 0; + ok = 1; } } else { if (__ratelimit(&drbd_ratelimit_state)) dev_err(DEV, "Sending NegRSDReply. sector %llus.\n", - (unsigned long long)peer_req->i.sector); + (unsigned long long)e->sector); - err = drbd_send_ack(mdev, P_NEG_RS_DREPLY, peer_req); + ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e); /* update resync data with failure */ - drbd_rs_failed_io(mdev, peer_req->i.sector, peer_req->i.size); + drbd_rs_failed_io(mdev, e->sector, e->size); } dec_unacked(mdev); - move_to_net_ee_or_free(mdev, peer_req); + move_to_net_ee_or_free(mdev, e); - if (unlikely(err)) + if (unlikely(!ok)) dev_err(DEV, "drbd_send_block() failed\n"); - return err; + return ok; } -int w_e_end_csum_rs_req(struct drbd_work *w, int cancel) +int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) { - struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w); - struct drbd_conf *mdev = w->mdev; + struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); struct digest_info *di; int digest_size; void *digest = NULL; - int err, eq = 0; + int ok, eq = 0; if (unlikely(cancel)) { - drbd_free_peer_req(mdev, peer_req); + drbd_free_ee(mdev, e); dec_unacked(mdev); - return 0; + return 1; } if (get_ldev(mdev)) { - drbd_rs_complete_io(mdev, peer_req->i.sector); + drbd_rs_complete_io(mdev, e->sector); put_ldev(mdev); } - di = peer_req->digest; + di = e->digest; - if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { + if (likely((e->flags & EE_WAS_ERROR) == 0)) { /* quick hack to try to avoid a race against reconfiguration. 
* a real fix would be much more involved, * introducing more locking mechanisms */ - if (mdev->tconn->csums_tfm) { - digest_size = crypto_hash_digestsize(mdev->tconn->csums_tfm); + if (mdev->csums_tfm) { + digest_size = crypto_hash_digestsize(mdev->csums_tfm); D_ASSERT(digest_size == di->digest_size); digest = kmalloc(digest_size, GFP_NOIO); } if (digest) { - drbd_csum_ee(mdev, mdev->tconn->csums_tfm, peer_req, digest); + drbd_csum_ee(mdev, mdev->csums_tfm, e, digest); eq = !memcmp(digest, di->digest, digest_size); kfree(digest); } if (eq) { - drbd_set_in_sync(mdev, peer_req->i.sector, peer_req->i.size); + drbd_set_in_sync(mdev, e->sector, e->size); /* rs_same_csums unit is BM_BLOCK_SIZE */ - mdev->rs_same_csum += peer_req->i.size >> BM_BLOCK_SHIFT; - err = drbd_send_ack(mdev, P_RS_IS_IN_SYNC, peer_req); + mdev->rs_same_csum += e->size >> BM_BLOCK_SHIFT; + ok = drbd_send_ack(mdev, P_RS_IS_IN_SYNC, e); } else { inc_rs_pending(mdev); - peer_req->block_id = ID_SYNCER; /* By setting block_id, digest pointer becomes invalid! */ - peer_req->flags &= ~EE_HAS_DIGEST; /* This peer request no longer has a digest pointer */ + e->block_id = ID_SYNCER; /* By setting block_id, digest pointer becomes invalid! */ + e->flags &= ~EE_HAS_DIGEST; /* This e no longer has a digest pointer */ kfree(di); - err = drbd_send_block(mdev, P_RS_DATA_REPLY, peer_req); + ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e); } } else { - err = drbd_send_ack(mdev, P_NEG_RS_DREPLY, peer_req); + ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e); if (__ratelimit(&drbd_ratelimit_state)) dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n"); } dec_unacked(mdev); - move_to_net_ee_or_free(mdev, peer_req); + move_to_net_ee_or_free(mdev, e); - if (unlikely(err)) + if (unlikely(!ok)) dev_err(DEV, "drbd_send_block/ack() failed\n"); - return err; + return ok; } -int w_e_end_ov_req(struct drbd_work *w, int cancel) +/* TODO merge common code with w_e_send_csum */ +int w_e_end_ov_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) { - struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w); - struct drbd_conf *mdev = w->mdev; - sector_t sector = peer_req->i.sector; - unsigned int size = peer_req->i.size; + struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); + sector_t sector = e->sector; + unsigned int size = e->size; int digest_size; void *digest; - int err = 0; + int ok = 1; if (unlikely(cancel)) goto out; - digest_size = crypto_hash_digestsize(mdev->tconn->verify_tfm); + digest_size = crypto_hash_digestsize(mdev->verify_tfm); digest = kmalloc(digest_size, GFP_NOIO); if (!digest) { - err = 1; /* terminate the connection in case the allocation failed */ + ok = 0; /* terminate the connection in case the allocation failed */ goto out; } - if (likely(!(peer_req->flags & EE_WAS_ERROR))) - drbd_csum_ee(mdev, mdev->tconn->verify_tfm, peer_req, digest); + if (likely(!(e->flags & EE_WAS_ERROR))) + drbd_csum_ee(mdev, mdev->verify_tfm, e, digest); else memset(digest, 0, digest_size); @@ -1160,23 +1120,25 @@ int w_e_end_ov_req(struct drbd_work *w, int cancel) * In case we block on congestion, we could otherwise run into * some distributed deadlock, if the other side blocks on * congestion as well, because our receiver blocks in - * drbd_alloc_pages due to pp_in_use > max_buffers. */ - drbd_free_peer_req(mdev, peer_req); - peer_req = NULL; + * drbd_pp_alloc due to pp_in_use > max_buffers. 
*/ + drbd_free_ee(mdev, e); + e = NULL; inc_rs_pending(mdev); - err = drbd_send_drequest_csum(mdev, sector, size, digest, digest_size, P_OV_REPLY); - if (err) + ok = drbd_send_drequest_csum(mdev, sector, size, + digest, digest_size, + P_OV_REPLY); + if (!ok) dec_rs_pending(mdev); kfree(digest); out: - if (peer_req) - drbd_free_peer_req(mdev, peer_req); + if (e) + drbd_free_ee(mdev, e); dec_unacked(mdev); - return err; + return ok; } -void drbd_ov_out_of_sync_found(struct drbd_conf *mdev, sector_t sector, int size) +void drbd_ov_oos_found(struct drbd_conf *mdev, sector_t sector, int size) { if (mdev->ov_last_oos_start + mdev->ov_last_oos_size == sector) { mdev->ov_last_oos_size += size>>9; @@ -1187,38 +1149,36 @@ void drbd_ov_out_of_sync_found(struct drbd_conf *mdev, sector_t sector, int size drbd_set_out_of_sync(mdev, sector, size); } -int w_e_end_ov_reply(struct drbd_work *w, int cancel) +int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel) { - struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w); - struct drbd_conf *mdev = w->mdev; + struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); struct digest_info *di; void *digest; - sector_t sector = peer_req->i.sector; - unsigned int size = peer_req->i.size; + sector_t sector = e->sector; + unsigned int size = e->size; int digest_size; - int err, eq = 0; - bool stop_sector_reached = false; + int ok, eq = 0; if (unlikely(cancel)) { - drbd_free_peer_req(mdev, peer_req); + drbd_free_ee(mdev, e); dec_unacked(mdev); - return 0; + return 1; } /* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all * the resync lru has been cleaned up already */ if (get_ldev(mdev)) { - drbd_rs_complete_io(mdev, peer_req->i.sector); + drbd_rs_complete_io(mdev, e->sector); put_ldev(mdev); } - di = peer_req->digest; + di = e->digest; - if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { - digest_size = crypto_hash_digestsize(mdev->tconn->verify_tfm); + if (likely((e->flags & EE_WAS_ERROR) == 0)) { + digest_size = crypto_hash_digestsize(mdev->verify_tfm); digest = kmalloc(digest_size, GFP_NOIO); if (digest) { - drbd_csum_ee(mdev, mdev->tconn->verify_tfm, peer_req, digest); + drbd_csum_ee(mdev, mdev->verify_tfm, e, digest); D_ASSERT(digest_size == di->digest_size); eq = !memcmp(digest, di->digest, digest_size); @@ -1226,19 +1186,19 @@ int w_e_end_ov_reply(struct drbd_work *w, int cancel) } } - /* Free peer_req and pages before send. - * In case we block on congestion, we could otherwise run into - * some distributed deadlock, if the other side blocks on - * congestion as well, because our receiver blocks in - * drbd_alloc_pages due to pp_in_use > max_buffers. */ - drbd_free_peer_req(mdev, peer_req); + /* Free e and pages before send. + * In case we block on congestion, we could otherwise run into + * some distributed deadlock, if the other side blocks on + * congestion as well, because our receiver blocks in + * drbd_pp_alloc due to pp_in_use > max_buffers. */ + drbd_free_ee(mdev, e); if (!eq) - drbd_ov_out_of_sync_found(mdev, sector, size); + drbd_ov_oos_found(mdev, sector, size); else - ov_out_of_sync_print(mdev); + ov_oos_print(mdev); - err = drbd_send_ack_ex(mdev, P_OV_RESULT, sector, size, - eq ? ID_IN_SYNC : ID_OUT_OF_SYNC); + ok = drbd_send_ack_ex(mdev, P_OV_RESULT, sector, size, + eq ? 
ID_IN_SYNC : ID_OUT_OF_SYNC); dec_unacked(mdev); @@ -1248,102 +1208,73 @@ int w_e_end_ov_reply(struct drbd_work *w, int cancel) if ((mdev->ov_left & 0x200) == 0x200) drbd_advance_rs_marks(mdev, mdev->ov_left); - stop_sector_reached = verify_can_do_stop_sector(mdev) && - (sector + (size>>9)) >= mdev->ov_stop_sector; - - if (mdev->ov_left == 0 || stop_sector_reached) { - ov_out_of_sync_print(mdev); + if (mdev->ov_left == 0) { + ov_oos_print(mdev); drbd_resync_finished(mdev); } - return err; + return ok; } -int w_prev_work_done(struct drbd_work *w, int cancel) +int w_prev_work_done(struct drbd_conf *mdev, struct drbd_work *w, int cancel) { struct drbd_wq_barrier *b = container_of(w, struct drbd_wq_barrier, w); - complete(&b->done); - return 0; -} - -/* FIXME - * We need to track the number of pending barrier acks, - * and to be able to wait for them. - * See also comment in drbd_adm_attach before drbd_suspend_io. - */ -int drbd_send_barrier(struct drbd_tconn *tconn) -{ - struct p_barrier *p; - struct drbd_socket *sock; - - sock = &tconn->data; - p = conn_prepare_command(tconn, sock); - if (!p) - return -EIO; - p->barrier = tconn->send.current_epoch_nr; - p->pad = 0; - tconn->send.current_epoch_writes = 0; - - return conn_send_command(tconn, sock, P_BARRIER, sizeof(*p), NULL, 0); + return 1; } -int w_send_write_hint(struct drbd_work *w, int cancel) +int w_send_barrier(struct drbd_conf *mdev, struct drbd_work *w, int cancel) { - struct drbd_conf *mdev = w->mdev; - struct drbd_socket *sock; - + struct drbd_tl_epoch *b = container_of(w, struct drbd_tl_epoch, w); + struct p_barrier *p = &mdev->data.sbuf.barrier; + int ok = 1; + + /* really avoid racing with tl_clear. w.cb may have been referenced + * just before it was reassigned and re-queued, so double check that. + * actually, this race was harmless, since we only try to send the + * barrier packet here, and otherwise do nothing with the object. + * but compare with the head of w_clear_epoch */ + spin_lock_irq(&mdev->req_lock); + if (w->cb != w_send_barrier || mdev->state.conn < C_CONNECTED) + cancel = 1; + spin_unlock_irq(&mdev->req_lock); if (cancel) - return 0; - sock = &mdev->tconn->data; - if (!drbd_prepare_command(mdev, sock)) - return -EIO; - return drbd_send_command(mdev, sock, P_UNPLUG_REMOTE, 0, NULL, 0); -} + return 1; -static void re_init_if_first_write(struct drbd_tconn *tconn, unsigned int epoch) -{ - if (!tconn->send.seen_any_write_yet) { - tconn->send.seen_any_write_yet = true; - tconn->send.current_epoch_nr = epoch; - tconn->send.current_epoch_writes = 0; - } + if (!drbd_get_data_sock(mdev)) + return 0; + p->barrier = b->br_number; + /* inc_ap_pending was done where this was queued. + * dec_ap_pending will be done in got_BarrierAck + * or (on connection loss) in w_clear_epoch. 
*/ + ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BARRIER, + (struct p_header80 *)p, sizeof(*p), 0); + drbd_put_data_sock(mdev); + + return ok; } -static void maybe_send_barrier(struct drbd_tconn *tconn, unsigned int epoch) +int w_send_write_hint(struct drbd_conf *mdev, struct drbd_work *w, int cancel) { - /* re-init if first write on this connection */ - if (!tconn->send.seen_any_write_yet) - return; - if (tconn->send.current_epoch_nr != epoch) { - if (tconn->send.current_epoch_writes) - drbd_send_barrier(tconn); - tconn->send.current_epoch_nr = epoch; - } + if (cancel) + return 1; + return drbd_send_short_cmd(mdev, P_UNPLUG_REMOTE); } -int w_send_out_of_sync(struct drbd_work *w, int cancel) +int w_send_oos(struct drbd_conf *mdev, struct drbd_work *w, int cancel) { struct drbd_request *req = container_of(w, struct drbd_request, w); - struct drbd_conf *mdev = w->mdev; - struct drbd_tconn *tconn = mdev->tconn; - int err; + int ok; if (unlikely(cancel)) { - req_mod(req, SEND_CANCELED); - return 0; + req_mod(req, send_canceled); + return 1; } - /* this time, no tconn->send.current_epoch_writes++; - * If it was sent, it was the closing barrier for the last - * replicated epoch, before we went into AHEAD mode. - * No more barriers will be sent, until we leave AHEAD mode again. */ - maybe_send_barrier(tconn, req->epoch); - - err = drbd_send_out_of_sync(mdev, req); - req_mod(req, OOS_HANDED_TO_NETWORK); + ok = drbd_send_oos(mdev, req); + req_mod(req, oos_handed_to_network); - return err; + return ok; } /** @@ -1352,26 +1283,20 @@ int w_send_out_of_sync(struct drbd_work *w, int cancel) * @w: work object. * @cancel: The connection will be closed anyways */ -int w_send_dblock(struct drbd_work *w, int cancel) +int w_send_dblock(struct drbd_conf *mdev, struct drbd_work *w, int cancel) { struct drbd_request *req = container_of(w, struct drbd_request, w); - struct drbd_conf *mdev = w->mdev; - struct drbd_tconn *tconn = mdev->tconn; - int err; + int ok; if (unlikely(cancel)) { - req_mod(req, SEND_CANCELED); - return 0; + req_mod(req, send_canceled); + return 1; } - re_init_if_first_write(tconn, req->epoch); - maybe_send_barrier(tconn, req->epoch); - tconn->send.current_epoch_writes++; - - err = drbd_send_dblock(mdev, req); - req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK); + ok = drbd_send_dblock(mdev, req); + req_mod(req, ok ? handed_over_to_network : send_failed); - return err; + return ok; } /** @@ -1380,61 +1305,57 @@ int w_send_dblock(struct drbd_work *w, int cancel) * @w: work object. * @cancel: The connection will be closed anyways */ -int w_send_read_req(struct drbd_work *w, int cancel) +int w_send_read_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) { struct drbd_request *req = container_of(w, struct drbd_request, w); - struct drbd_conf *mdev = w->mdev; - struct drbd_tconn *tconn = mdev->tconn; - int err; + int ok; if (unlikely(cancel)) { - req_mod(req, SEND_CANCELED); - return 0; + req_mod(req, send_canceled); + return 1; } - /* Even read requests may close a write epoch, - * if there was any yet. */ - maybe_send_barrier(tconn, req->epoch); + ok = drbd_send_drequest(mdev, P_DATA_REQUEST, req->sector, req->size, + (unsigned long)req); - err = drbd_send_drequest(mdev, P_DATA_REQUEST, req->i.sector, req->i.size, - (unsigned long)req); - - req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK); + if (!ok) { + /* ?? 
we set C_TIMEOUT or C_BROKEN_PIPE in drbd_send(); + * so this is probably redundant */ + if (mdev->state.conn >= C_CONNECTED) + drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE)); + } + req_mod(req, ok ? handed_over_to_network : send_failed); - return err; + return ok; } -int w_restart_disk_io(struct drbd_work *w, int cancel) +int w_restart_disk_io(struct drbd_conf *mdev, struct drbd_work *w, int cancel) { struct drbd_request *req = container_of(w, struct drbd_request, w); - struct drbd_conf *mdev = w->mdev; if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG) - drbd_al_begin_io(mdev, &req->i); + drbd_al_begin_io(mdev, req->sector); + /* Calling drbd_al_begin_io() out of the worker might deadlocks + theoretically. Practically it can not deadlock, since this is + only used when unfreezing IOs. All the extents of the requests + that made it into the TL are already active */ drbd_req_make_private_bio(req, req->master_bio); req->private_bio->bi_bdev = mdev->ldev->backing_bdev; generic_make_request(req->private_bio); - return 0; + return 1; } static int _drbd_may_sync_now(struct drbd_conf *mdev) { struct drbd_conf *odev = mdev; - int resync_after; while (1) { - if (!odev->ldev) - return 1; - rcu_read_lock(); - resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after; - rcu_read_unlock(); - if (resync_after == -1) - return 1; - odev = minor_to_mdev(resync_after); - if (!expect(odev)) + if (odev->sync_conf.after == -1) return 1; + odev = minor_to_mdev(odev->sync_conf.after); + ERR_IF(!odev) return 1; if ((odev->state.conn >= C_SYNC_SOURCE && odev->state.conn <= C_PAUSED_SYNC_T) || odev->state.aftr_isp || odev->state.peer_isp || @@ -1454,15 +1375,16 @@ static int _drbd_pause_after(struct drbd_conf *mdev) struct drbd_conf *odev; int i, rv = 0; - rcu_read_lock(); - idr_for_each_entry(&minors, odev, i) { + for (i = 0; i < minor_count; i++) { + odev = minor_to_mdev(i); + if (!odev) + continue; if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS) continue; if (!_drbd_may_sync_now(odev)) rv |= (__drbd_set_state(_NS(odev, aftr_isp, 1), CS_HARD, NULL) != SS_NOTHING_TO_DO); } - rcu_read_unlock(); return rv; } @@ -1478,8 +1400,10 @@ static int _drbd_resume_next(struct drbd_conf *mdev) struct drbd_conf *odev; int i, rv = 0; - rcu_read_lock(); - idr_for_each_entry(&minors, odev, i) { + for (i = 0; i < minor_count; i++) { + odev = minor_to_mdev(i); + if (!odev) + continue; if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS) continue; if (odev->state.aftr_isp) { @@ -1489,7 +1413,6 @@ static int _drbd_resume_next(struct drbd_conf *mdev) != SS_NOTHING_TO_DO) ; } } - rcu_read_unlock(); return rv; } @@ -1507,86 +1430,57 @@ void suspend_other_sg(struct drbd_conf *mdev) write_unlock_irq(&global_state_lock); } -/* caller must hold global_state_lock */ -enum drbd_ret_code drbd_resync_after_valid(struct drbd_conf *mdev, int o_minor) +static int sync_after_error(struct drbd_conf *mdev, int o_minor) { struct drbd_conf *odev; - int resync_after; if (o_minor == -1) return NO_ERROR; if (o_minor < -1 || minor_to_mdev(o_minor) == NULL) - return ERR_RESYNC_AFTER; + return ERR_SYNC_AFTER; /* check for loops */ odev = minor_to_mdev(o_minor); while (1) { if (odev == mdev) - return ERR_RESYNC_AFTER_CYCLE; + return ERR_SYNC_AFTER_CYCLE; - rcu_read_lock(); - resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after; - rcu_read_unlock(); /* dependency chain ends here, no cycles. 
*/ - if (resync_after == -1) + if (odev->sync_conf.after == -1) return NO_ERROR; /* follow the dependency chain */ - odev = minor_to_mdev(resync_after); + odev = minor_to_mdev(odev->sync_conf.after); } } -/* caller must hold global_state_lock */ -void drbd_resync_after_changed(struct drbd_conf *mdev) +int drbd_alter_sa(struct drbd_conf *mdev, int na) { int changes; + int retcode; - do { - changes = _drbd_pause_after(mdev); - changes |= _drbd_resume_next(mdev); - } while (changes); + write_lock_irq(&global_state_lock); + retcode = sync_after_error(mdev, na); + if (retcode == NO_ERROR) { + mdev->sync_conf.after = na; + do { + changes = _drbd_pause_after(mdev); + changes |= _drbd_resume_next(mdev); + } while (changes); + } + write_unlock_irq(&global_state_lock); + return retcode; } void drbd_rs_controller_reset(struct drbd_conf *mdev) { - struct fifo_buffer *plan; - atomic_set(&mdev->rs_sect_in, 0); atomic_set(&mdev->rs_sect_ev, 0); mdev->rs_in_flight = 0; - - /* Updating the RCU protected object in place is necessary since - this function gets called from atomic context. - It is valid since all other updates also lead to an completely - empty fifo */ - rcu_read_lock(); - plan = rcu_dereference(mdev->rs_plan_s); - plan->total = 0; - fifo_set(plan, 0); - rcu_read_unlock(); -} - -void start_resync_timer_fn(unsigned long data) -{ - struct drbd_conf *mdev = (struct drbd_conf *) data; - - drbd_queue_work(&mdev->tconn->sender_work, &mdev->start_resync_work); -} - -int w_start_resync(struct drbd_work *w, int cancel) -{ - struct drbd_conf *mdev = w->mdev; - - if (atomic_read(&mdev->unacked_cnt) || atomic_read(&mdev->rs_pending_cnt)) { - dev_warn(DEV, "w_start_resync later...\n"); - mdev->start_resync_timer.expires = jiffies + HZ/10; - add_timer(&mdev->start_resync_timer); - return 0; - } - - drbd_start_resync(mdev, C_SYNC_SOURCE); - clear_bit(AHEAD_TO_SYNC_SOURCE, &mdev->flags); - return 0; + mdev->rs_planed = 0; + spin_lock(&mdev->peer_seq_lock); + fifo_set(&mdev->rs_plan_s, 0); + spin_unlock(&mdev->peer_seq_lock); } /** @@ -1607,58 +1501,43 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side) return; } - if (!test_bit(B_RS_H_DONE, &mdev->flags)) { - if (side == C_SYNC_TARGET) { - /* Since application IO was locked out during C_WF_BITMAP_T and - C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET - we check that we might make the data inconsistent. */ - r = drbd_khelper(mdev, "before-resync-target"); - r = (r >> 8) & 0xff; - if (r > 0) { - dev_info(DEV, "before-resync-target handler returned %d, " + if (side == C_SYNC_TARGET) { + /* Since application IO was locked out during C_WF_BITMAP_T and + C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET + we check that we might make the data inconsistent. */ + r = drbd_khelper(mdev, "before-resync-target"); + r = (r >> 8) & 0xff; + if (r > 0) { + dev_info(DEV, "before-resync-target handler returned %d, " + "dropping connection.\n", r); + drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); + return; + } + } else /* C_SYNC_SOURCE */ { + r = drbd_khelper(mdev, "before-resync-source"); + r = (r >> 8) & 0xff; + if (r > 0) { + if (r == 3) { + dev_info(DEV, "before-resync-source handler returned %d, " + "ignoring. 
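Illustrative aside: the loop in sync_after_error() / drbd_resync_after_valid() above rejects dependency cycles by walking the resync-after chain until it either ends (-1) or comes back to the starting device. A configuration that would be refused:

/*
 *   minor 0: sync_conf.after = 1   (resync only after minor 1 has finished)
 *   minor 1: sync_conf.after = 0   (resync only after minor 0 has finished)
 *
 * Validating the second setting walks from minor 0 to minor 1, which is the
 * device being configured, so ERR_SYNC_AFTER_CYCLE is returned and the
 * setting is not applied.
 */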
Old userland tools?", r); + } else { + dev_info(DEV, "before-resync-source handler returned %d, " "dropping connection.\n", r); - conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD); + drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); return; } - } else /* C_SYNC_SOURCE */ { - r = drbd_khelper(mdev, "before-resync-source"); - r = (r >> 8) & 0xff; - if (r > 0) { - if (r == 3) { - dev_info(DEV, "before-resync-source handler returned %d, " - "ignoring. Old userland tools?", r); - } else { - dev_info(DEV, "before-resync-source handler returned %d, " - "dropping connection.\n", r); - conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD); - return; - } - } } } - if (current == mdev->tconn->worker.task) { - /* The worker should not sleep waiting for state_mutex, - that can take long */ - if (!mutex_trylock(mdev->state_mutex)) { - set_bit(B_RS_H_DONE, &mdev->flags); - mdev->start_resync_timer.expires = jiffies + HZ/5; - add_timer(&mdev->start_resync_timer); - return; - } - } else { - mutex_lock(mdev->state_mutex); - } - clear_bit(B_RS_H_DONE, &mdev->flags); - + drbd_state_lock(mdev); write_lock_irq(&global_state_lock); if (!get_ldev_if_state(mdev, D_NEGOTIATING)) { write_unlock_irq(&global_state_lock); - mutex_unlock(mdev->state_mutex); + drbd_state_unlock(mdev); return; } - ns = drbd_read_state(mdev); + ns.i = mdev->state.i; ns.aftr_isp = !_drbd_may_sync_now(mdev); @@ -1670,7 +1549,7 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side) ns.pdsk = D_INCONSISTENT; r = __drbd_set_state(mdev, ns, CS_VERBOSE, NULL); - ns = drbd_read_state(mdev); + ns = mdev->state; if (ns.conn < C_CONNECTED) r = SS_UNKNOWN_ERROR; @@ -1696,10 +1575,6 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side) write_unlock_irq(&global_state_lock); if (r == SS_SUCCESS) { - /* reset rs_last_bcast when a resync or verify is started, - * to deal with potential jiffies wrap. */ - mdev->rs_last_bcast = jiffies - HZ; - dev_info(DEV, "Began resync as %s (will sync %lu KB [%lu bits set]).\n", drbd_conn_str(ns.conn), (unsigned long) mdev->rs_total << (BM_BLOCK_SHIFT-10), @@ -1714,10 +1589,10 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side) * drbd_resync_finished from here in that case. * We drbd_gen_and_send_sync_uuid here for protocol < 96, * and from after_state_ch otherwise. */ - if (side == C_SYNC_SOURCE && mdev->tconn->agreed_pro_version < 96) + if (side == C_SYNC_SOURCE && mdev->agreed_pro_version < 96) drbd_gen_and_send_sync_uuid(mdev); - if (mdev->tconn->agreed_pro_version < 95 && mdev->rs_total == 0) { + if (mdev->agreed_pro_version < 95 && mdev->rs_total == 0) { /* This still has a race (about when exactly the peers * detect connection loss) that can lead to a full sync * on next handshake. In 8.3.9 we fixed this with explicit @@ -1728,16 +1603,10 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side) * detect connection loss, then waiting for a ping * response (implicit in drbd_resync_finished) reduces * the race considerably, but does not solve it. 
*/ - if (side == C_SYNC_SOURCE) { - struct net_conf *nc; - int timeo; - - rcu_read_lock(); - nc = rcu_dereference(mdev->tconn->net_conf); - timeo = nc->ping_int * HZ + nc->ping_timeo * HZ / 9; - rcu_read_unlock(); - schedule_timeout_interruptible(timeo); - } + if (side == C_SYNC_SOURCE) + schedule_timeout_interruptible( + mdev->net_conf->ping_int * HZ + + mdev->net_conf->ping_timeo*HZ/9); drbd_resync_finished(mdev); } @@ -1752,180 +1621,114 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side) drbd_md_sync(mdev); } put_ldev(mdev); - mutex_unlock(mdev->state_mutex); + drbd_state_unlock(mdev); } -/* If the resource already closed the current epoch, but we did not - * (because we have not yet seen new requests), we should send the - * corresponding barrier now. Must be checked within the same spinlock - * that is used to check for new requests. */ -bool need_to_send_barrier(struct drbd_tconn *connection) -{ - if (!connection->send.seen_any_write_yet) - return false; - - /* Skip barriers that do not contain any writes. - * This may happen during AHEAD mode. */ - if (!connection->send.current_epoch_writes) - return false; - - /* ->req_lock is held when requests are queued on - * connection->sender_work, and put into ->transfer_log. - * It is also held when ->current_tle_nr is increased. - * So either there are already new requests queued, - * and corresponding barriers will be send there. - * Or nothing new is queued yet, so the difference will be 1. - */ - if (atomic_read(&connection->current_tle_nr) != - connection->send.current_epoch_nr + 1) - return false; - - return true; -} - -bool dequeue_work_batch(struct drbd_work_queue *queue, struct list_head *work_list) +int drbd_worker(struct drbd_thread *thi) { - spin_lock_irq(&queue->q_lock); - list_splice_init(&queue->q, work_list); - spin_unlock_irq(&queue->q_lock); - return !list_empty(work_list); -} + struct drbd_conf *mdev = thi->mdev; + struct drbd_work *w = NULL; + LIST_HEAD(work_list); + int intr = 0, i; -bool dequeue_work_item(struct drbd_work_queue *queue, struct list_head *work_list) -{ - spin_lock_irq(&queue->q_lock); - if (!list_empty(&queue->q)) - list_move(queue->q.next, work_list); - spin_unlock_irq(&queue->q_lock); - return !list_empty(work_list); -} + sprintf(current->comm, "drbd%d_worker", mdev_to_minor(mdev)); -void wait_for_work(struct drbd_tconn *connection, struct list_head *work_list) -{ - DEFINE_WAIT(wait); - struct net_conf *nc; - int uncork, cork; + while (get_t_state(thi) == Running) { + drbd_thread_current_set_cpu(mdev); - dequeue_work_item(&connection->sender_work, work_list); - if (!list_empty(work_list)) - return; + if (down_trylock(&mdev->data.work.s)) { + mutex_lock(&mdev->data.mutex); + if (mdev->data.socket && !mdev->net_conf->no_cork) + drbd_tcp_uncork(mdev->data.socket); + mutex_unlock(&mdev->data.mutex); - /* Still nothing to do? - * Maybe we still need to close the current epoch, - * even if no new requests are queued yet. - * - * Also, poke TCP, just in case. - * Then wait for new work (or signal). */ - rcu_read_lock(); - nc = rcu_dereference(connection->net_conf); - uncork = nc ? 
nc->tcp_cork : 0; - rcu_read_unlock(); - if (uncork) { - mutex_lock(&connection->data.mutex); - if (connection->data.socket) - drbd_tcp_uncork(connection->data.socket); - mutex_unlock(&connection->data.mutex); - } + intr = down_interruptible(&mdev->data.work.s); - for (;;) { - int send_barrier; - prepare_to_wait(&connection->sender_work.q_wait, &wait, TASK_INTERRUPTIBLE); - spin_lock_irq(&connection->req_lock); - spin_lock(&connection->sender_work.q_lock); /* FIXME get rid of this one? */ - /* dequeue single item only, - * we still use drbd_queue_work_front() in some places */ - if (!list_empty(&connection->sender_work.q)) - list_move(connection->sender_work.q.next, work_list); - spin_unlock(&connection->sender_work.q_lock); /* FIXME get rid of this one? */ - if (!list_empty(work_list) || signal_pending(current)) { - spin_unlock_irq(&connection->req_lock); - break; + mutex_lock(&mdev->data.mutex); + if (mdev->data.socket && !mdev->net_conf->no_cork) + drbd_tcp_cork(mdev->data.socket); + mutex_unlock(&mdev->data.mutex); } - send_barrier = need_to_send_barrier(connection); - spin_unlock_irq(&connection->req_lock); - if (send_barrier) { - drbd_send_barrier(connection); - connection->send.current_epoch_nr++; - } - schedule(); - /* may be woken up for other things but new work, too, - * e.g. if the current epoch got closed. - * In which case we send the barrier above. */ - } - finish_wait(&connection->sender_work.q_wait, &wait); - - /* someone may have changed the config while we have been waiting above. */ - rcu_read_lock(); - nc = rcu_dereference(connection->net_conf); - cork = nc ? nc->tcp_cork : 0; - rcu_read_unlock(); - mutex_lock(&connection->data.mutex); - if (connection->data.socket) { - if (cork) - drbd_tcp_cork(connection->data.socket); - else if (!uncork) - drbd_tcp_uncork(connection->data.socket); - } - mutex_unlock(&connection->data.mutex); -} -int drbd_worker(struct drbd_thread *thi) -{ - struct drbd_tconn *tconn = thi->tconn; - struct drbd_work *w = NULL; - struct drbd_conf *mdev; - LIST_HEAD(work_list); - int vnr; - - while (get_t_state(thi) == RUNNING) { - drbd_thread_current_set_cpu(thi); - - /* as long as we use drbd_queue_work_front(), - * we may only dequeue single work items here, not batches. */ - if (list_empty(&work_list)) - wait_for_work(tconn, &work_list); - - if (signal_pending(current)) { + if (intr) { + D_ASSERT(intr == -EINTR); flush_signals(current); - if (get_t_state(thi) == RUNNING) { - conn_warn(tconn, "Worker got an unexpected signal\n"); + ERR_IF (get_t_state(thi) == Running) continue; - } break; } - if (get_t_state(thi) != RUNNING) + if (get_t_state(thi) != Running) break; - - while (!list_empty(&work_list)) { - w = list_first_entry(&work_list, struct drbd_work, list); - list_del_init(&w->list); - if (w->cb(w, tconn->cstate < C_WF_REPORT_PARAMS) == 0) - continue; - if (tconn->cstate >= C_WF_REPORT_PARAMS) - conn_request_state(tconn, NS(conn, C_NETWORK_FAILURE), CS_HARD); + /* With this break, we have done a down() but not consumed + the entry from the list. The cleanup code takes care of + this... */ + + w = NULL; + spin_lock_irq(&mdev->data.work.q_lock); + ERR_IF(list_empty(&mdev->data.work.q)) { + /* something terribly wrong in our logic. + * we were able to down() the semaphore, + * but the list is empty... doh. + * + * what is the best thing to do now? + * try again from scratch, restarting the receiver, + * asender, whatnot? could break even more ugly, + * e.g. when we are primary, but no good local data. 
+ * + * I'll try to get away just starting over this loop. + */ + spin_unlock_irq(&mdev->data.work.q_lock); + continue; + } + w = list_entry(mdev->data.work.q.next, struct drbd_work, list); + list_del_init(&w->list); + spin_unlock_irq(&mdev->data.work.q_lock); + + if (!w->cb(mdev, w, mdev->state.conn < C_CONNECTED)) { + /* dev_warn(DEV, "worker: a callback failed! \n"); */ + if (mdev->state.conn >= C_CONNECTED) + drbd_force_state(mdev, + NS(conn, C_NETWORK_FAILURE)); } } + D_ASSERT(test_bit(DEVICE_DYING, &mdev->flags)); + D_ASSERT(test_bit(CONFIG_PENDING, &mdev->flags)); + + spin_lock_irq(&mdev->data.work.q_lock); + i = 0; + while (!list_empty(&mdev->data.work.q)) { + list_splice_init(&mdev->data.work.q, &work_list); + spin_unlock_irq(&mdev->data.work.q_lock); - do { while (!list_empty(&work_list)) { - w = list_first_entry(&work_list, struct drbd_work, list); + w = list_entry(work_list.next, struct drbd_work, list); list_del_init(&w->list); - w->cb(w, 1); + w->cb(mdev, w, 1); + i++; /* dead debugging code */ } - dequeue_work_batch(&tconn->sender_work, &work_list); - } while (!list_empty(&work_list)); - - rcu_read_lock(); - idr_for_each_entry(&tconn->volumes, mdev, vnr) { - D_ASSERT(mdev->state.disk == D_DISKLESS && mdev->state.conn == C_STANDALONE); - kref_get(&mdev->kref); - rcu_read_unlock(); - drbd_mdev_cleanup(mdev); - kref_put(&mdev->kref, &drbd_minor_destroy); - rcu_read_lock(); + + spin_lock_irq(&mdev->data.work.q_lock); } - rcu_read_unlock(); + sema_init(&mdev->data.work.s, 0); + /* DANGEROUS race: if someone did queue his work within the spinlock, + * but up() ed outside the spinlock, we could get an up() on the + * semaphore without corresponding list entry. + * So don't do that. + */ + spin_unlock_irq(&mdev->data.work.q_lock); + + D_ASSERT(mdev->state.disk == D_DISKLESS && mdev->state.conn == C_STANDALONE); + /* _drbd_set_state only uses stop_nowait. + * wait here for the Exiting receiver. 
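Illustrative aside: the "DANGEROUS race" note above depends on the enqueue side keeping the semaphore count and the work list in step. The matching drbd_queue_work() of this era does the up() while still holding q_lock, roughly as sketched below; this is reproduced from memory as an assumption, not from this patch.

static inline void drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w)
{
	unsigned long flags;

	spin_lock_irqsave(&q->q_lock, flags);
	list_add_tail(&w->list, &q->q);
	up(&q->s);	/* one up() per queued entry, inside the lock */
	spin_unlock_irqrestore(&q->q_lock, flags);
}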
*/ + drbd_thread_stop(&mdev->receiver); + drbd_mdev_cleanup(mdev); + + dev_info(DEV, "worker terminated\n"); + + clear_bit(DEVICE_DYING, &mdev->flags); + clear_bit(CONFIG_PENDING, &mdev->flags); + wake_up(&mdev->state_wait); return 0; } diff --git a/trunk/drivers/block/drbd/drbd_wrappers.h b/trunk/drivers/block/drbd/drbd_wrappers.h index 328f18e4b4ee..151f1a37478f 100644 --- a/trunk/drivers/block/drbd/drbd_wrappers.h +++ b/trunk/drivers/block/drbd/drbd_wrappers.h @@ -3,7 +3,6 @@ #include #include -#include "drbd_int.h" /* see get_sb_bdev and bd_claim */ extern char *drbd_sec_holder; @@ -21,8 +20,8 @@ static inline void drbd_set_my_capacity(struct drbd_conf *mdev, /* bi_end_io handlers */ extern void drbd_md_io_complete(struct bio *bio, int error); -extern void drbd_peer_request_endio(struct bio *bio, int error); -extern void drbd_request_endio(struct bio *bio, int error); +extern void drbd_endio_sec(struct bio *bio, int error); +extern void drbd_endio_pri(struct bio *bio, int error); /* * used to submit our private bio @@ -46,6 +45,12 @@ static inline void drbd_generic_make_request(struct drbd_conf *mdev, generic_make_request(bio); } +static inline int drbd_crypto_is_hash(struct crypto_tfm *tfm) +{ + return (crypto_tfm_alg_type(tfm) & CRYPTO_ALG_TYPE_HASH_MASK) + == CRYPTO_ALG_TYPE_HASH; +} + #ifndef __CHECKER__ # undef __cond_lock # define __cond_lock(x,c) (c) diff --git a/trunk/drivers/block/loop.c b/trunk/drivers/block/loop.c index ae1251270624..54046e51160a 100644 --- a/trunk/drivers/block/loop.c +++ b/trunk/drivers/block/loop.c @@ -463,7 +463,6 @@ static int do_bio_filebacked(struct loop_device *lo, struct bio *bio) */ static void loop_add_bio(struct loop_device *lo, struct bio *bio) { - lo->lo_bio_count++; bio_list_add(&lo->lo_bio_list, bio); } @@ -472,7 +471,6 @@ static void loop_add_bio(struct loop_device *lo, struct bio *bio) */ static struct bio *loop_get_bio(struct loop_device *lo) { - lo->lo_bio_count--; return bio_list_pop(&lo->lo_bio_list); } @@ -491,10 +489,6 @@ static void loop_make_request(struct request_queue *q, struct bio *old_bio) goto out; if (unlikely(rw == WRITE && (lo->lo_flags & LO_FLAGS_READ_ONLY))) goto out; - if (lo->lo_bio_count >= q->nr_congestion_on) - wait_event_lock_irq(lo->lo_req_wait, - lo->lo_bio_count < q->nr_congestion_off, - lo->lo_lock); loop_add_bio(lo, old_bio); wake_up(&lo->lo_event); spin_unlock_irq(&lo->lo_lock); @@ -552,8 +546,6 @@ static int loop_thread(void *data) continue; spin_lock_irq(&lo->lo_lock); bio = loop_get_bio(lo); - if (lo->lo_bio_count < lo->lo_queue->nr_congestion_off) - wake_up(&lo->lo_req_wait); spin_unlock_irq(&lo->lo_lock); BUG_ON(!bio); @@ -881,7 +873,6 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, lo->transfer = transfer_none; lo->ioctl = NULL; lo->lo_sizelimit = 0; - lo->lo_bio_count = 0; lo->old_gfp_mask = mapping_gfp_mask(mapping); mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); @@ -1682,7 +1673,6 @@ static int loop_add(struct loop_device **l, int i) lo->lo_number = i; lo->lo_thread = NULL; init_waitqueue_head(&lo->lo_event); - init_waitqueue_head(&lo->lo_req_wait); spin_lock_init(&lo->lo_lock); disk->major = LOOP_MAJOR; disk->first_minor = i << part_shift; diff --git a/trunk/drivers/block/xen-blkback/blkback.c b/trunk/drivers/block/xen-blkback/blkback.c index 74374fb762aa..280a13846e6c 100644 --- a/trunk/drivers/block/xen-blkback/blkback.c +++ b/trunk/drivers/block/xen-blkback/blkback.c @@ -39,7 +39,6 @@ #include #include #include -#include #include #include @@ -80,7 +79,6 @@ 
struct pending_req { unsigned short operation; int status; struct list_head free_list; - DECLARE_BITMAP(unmap_seg, BLKIF_MAX_SEGMENTS_PER_REQUEST); }; #define BLKBACK_INVALID_HANDLE (~0) @@ -100,36 +98,6 @@ struct xen_blkbk { static struct xen_blkbk *blkbk; -/* - * Maximum number of grant pages that can be mapped in blkback. - * BLKIF_MAX_SEGMENTS_PER_REQUEST * RING_SIZE is the maximum number of - * pages that blkback will persistently map. - * Currently, this is: - * RING_SIZE = 32 (for all known ring types) - * BLKIF_MAX_SEGMENTS_PER_REQUEST = 11 - * sizeof(struct persistent_gnt) = 48 - * So the maximum memory used to store the grants is: - * 32 * 11 * 48 = 16896 bytes - */ -static inline unsigned int max_mapped_grant_pages(enum blkif_protocol protocol) -{ - switch (protocol) { - case BLKIF_PROTOCOL_NATIVE: - return __CONST_RING_SIZE(blkif, PAGE_SIZE) * - BLKIF_MAX_SEGMENTS_PER_REQUEST; - case BLKIF_PROTOCOL_X86_32: - return __CONST_RING_SIZE(blkif_x86_32, PAGE_SIZE) * - BLKIF_MAX_SEGMENTS_PER_REQUEST; - case BLKIF_PROTOCOL_X86_64: - return __CONST_RING_SIZE(blkif_x86_64, PAGE_SIZE) * - BLKIF_MAX_SEGMENTS_PER_REQUEST; - default: - BUG(); - } - return 0; -} - - /* * Little helpful macro to figure out the index and virtual address of the * pending_pages[..]. For each 'pending_req' we have have up to @@ -161,90 +129,6 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, static void make_response(struct xen_blkif *blkif, u64 id, unsigned short op, int st); -#define foreach_grant(pos, rbtree, node) \ - for ((pos) = container_of(rb_first((rbtree)), typeof(*(pos)), node); \ - &(pos)->node != NULL; \ - (pos) = container_of(rb_next(&(pos)->node), typeof(*(pos)), node)) - - -static void add_persistent_gnt(struct rb_root *root, - struct persistent_gnt *persistent_gnt) -{ - struct rb_node **new = &(root->rb_node), *parent = NULL; - struct persistent_gnt *this; - - /* Figure out where to put new node */ - while (*new) { - this = container_of(*new, struct persistent_gnt, node); - - parent = *new; - if (persistent_gnt->gnt < this->gnt) - new = &((*new)->rb_left); - else if (persistent_gnt->gnt > this->gnt) - new = &((*new)->rb_right); - else { - pr_alert(DRV_PFX " trying to add a gref that's already in the tree\n"); - BUG(); - } - } - - /* Add new node and rebalance tree. 
*/ - rb_link_node(&(persistent_gnt->node), parent, new); - rb_insert_color(&(persistent_gnt->node), root); -} - -static struct persistent_gnt *get_persistent_gnt(struct rb_root *root, - grant_ref_t gref) -{ - struct persistent_gnt *data; - struct rb_node *node = root->rb_node; - - while (node) { - data = container_of(node, struct persistent_gnt, node); - - if (gref < data->gnt) - node = node->rb_left; - else if (gref > data->gnt) - node = node->rb_right; - else - return data; - } - return NULL; -} - -static void free_persistent_gnts(struct rb_root *root, unsigned int num) -{ - struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; - struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; - struct persistent_gnt *persistent_gnt; - int ret = 0; - int segs_to_unmap = 0; - - foreach_grant(persistent_gnt, root, node) { - BUG_ON(persistent_gnt->handle == - BLKBACK_INVALID_HANDLE); - gnttab_set_unmap_op(&unmap[segs_to_unmap], - (unsigned long) pfn_to_kaddr(page_to_pfn( - persistent_gnt->page)), - GNTMAP_host_map, - persistent_gnt->handle); - - pages[segs_to_unmap] = persistent_gnt->page; - rb_erase(&persistent_gnt->node, root); - kfree(persistent_gnt); - num--; - - if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST || - !rb_next(&persistent_gnt->node)) { - ret = gnttab_unmap_refs(unmap, NULL, pages, - segs_to_unmap); - BUG_ON(ret); - segs_to_unmap = 0; - } - } - BUG_ON(num != 0); -} - /* * Retrieve from the 'pending_reqs' a free pending_req structure to be used. */ @@ -418,14 +302,6 @@ int xen_blkif_schedule(void *arg) print_stats(blkif); } - /* Free all persistent grant pages */ - if (!RB_EMPTY_ROOT(&blkif->persistent_gnts)) - free_persistent_gnts(&blkif->persistent_gnts, - blkif->persistent_gnt_c); - - BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts)); - blkif->persistent_gnt_c = 0; - if (log_stats) print_stats(blkif); @@ -452,8 +328,6 @@ static void xen_blkbk_unmap(struct pending_req *req) int ret; for (i = 0; i < req->nr_pages; i++) { - if (!test_bit(i, req->unmap_seg)) - continue; handle = pending_handle(req, i); if (handle == BLKBACK_INVALID_HANDLE) continue; @@ -470,26 +344,12 @@ static void xen_blkbk_unmap(struct pending_req *req) static int xen_blkbk_map(struct blkif_request *req, struct pending_req *pending_req, - struct seg_buf seg[], - struct page *pages[]) + struct seg_buf seg[]) { struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST]; - struct persistent_gnt *persistent_gnts[BLKIF_MAX_SEGMENTS_PER_REQUEST]; - struct page *pages_to_gnt[BLKIF_MAX_SEGMENTS_PER_REQUEST]; - struct persistent_gnt *persistent_gnt = NULL; - struct xen_blkif *blkif = pending_req->blkif; - phys_addr_t addr = 0; - int i, j; - bool new_map; + int i; int nseg = req->u.rw.nr_segments; - int segs_to_map = 0; int ret = 0; - int use_persistent_gnts; - - use_persistent_gnts = (blkif->vbd.feature_gnt_persistent); - - BUG_ON(blkif->persistent_gnt_c > - max_mapped_grant_pages(pending_req->blkif->blk_protocol)); /* * Fill out preq.nr_sects with proper amount of sectors, and setup @@ -499,146 +359,36 @@ static int xen_blkbk_map(struct blkif_request *req, for (i = 0; i < nseg; i++) { uint32_t flags; - if (use_persistent_gnts) - persistent_gnt = get_persistent_gnt( - &blkif->persistent_gnts, - req->u.rw.seg[i].gref); - - if (persistent_gnt) { - /* - * We are using persistent grants and - * the grant is already mapped - */ - new_map = false; - } else if (use_persistent_gnts && - blkif->persistent_gnt_c < - max_mapped_grant_pages(blkif->blk_protocol)) { - /* - * We are using persistent grants, the grant 
is - * not mapped but we have room for it - */ - new_map = true; - persistent_gnt = kmalloc( - sizeof(struct persistent_gnt), - GFP_KERNEL); - if (!persistent_gnt) - return -ENOMEM; - persistent_gnt->page = alloc_page(GFP_KERNEL); - if (!persistent_gnt->page) { - kfree(persistent_gnt); - return -ENOMEM; - } - persistent_gnt->gnt = req->u.rw.seg[i].gref; - persistent_gnt->handle = BLKBACK_INVALID_HANDLE; - - pages_to_gnt[segs_to_map] = - persistent_gnt->page; - addr = (unsigned long) pfn_to_kaddr( - page_to_pfn(persistent_gnt->page)); - - add_persistent_gnt(&blkif->persistent_gnts, - persistent_gnt); - blkif->persistent_gnt_c++; - pr_debug(DRV_PFX " grant %u added to the tree of persistent grants, using %u/%u\n", - persistent_gnt->gnt, blkif->persistent_gnt_c, - max_mapped_grant_pages(blkif->blk_protocol)); - } else { - /* - * We are either using persistent grants and - * hit the maximum limit of grants mapped, - * or we are not using persistent grants. - */ - if (use_persistent_gnts && - !blkif->vbd.overflow_max_grants) { - blkif->vbd.overflow_max_grants = 1; - pr_alert(DRV_PFX " domain %u, device %#x is using maximum number of persistent grants\n", - blkif->domid, blkif->vbd.handle); - } - new_map = true; - pages[i] = blkbk->pending_page(pending_req, i); - addr = vaddr(pending_req, i); - pages_to_gnt[segs_to_map] = - blkbk->pending_page(pending_req, i); - } - - if (persistent_gnt) { - pages[i] = persistent_gnt->page; - persistent_gnts[i] = persistent_gnt; - } else { - persistent_gnts[i] = NULL; - } - - if (new_map) { - flags = GNTMAP_host_map; - if (!persistent_gnt && - (pending_req->operation != BLKIF_OP_READ)) - flags |= GNTMAP_readonly; - gnttab_set_map_op(&map[segs_to_map++], addr, - flags, req->u.rw.seg[i].gref, - blkif->domid); - } + flags = GNTMAP_host_map; + if (pending_req->operation != BLKIF_OP_READ) + flags |= GNTMAP_readonly; + gnttab_set_map_op(&map[i], vaddr(pending_req, i), flags, + req->u.rw.seg[i].gref, + pending_req->blkif->domid); } - if (segs_to_map) { - ret = gnttab_map_refs(map, NULL, pages_to_gnt, segs_to_map); - BUG_ON(ret); - } + ret = gnttab_map_refs(map, NULL, &blkbk->pending_page(pending_req, 0), nseg); + BUG_ON(ret); /* * Now swizzle the MFN in our domain with the MFN from the other domain * so that when we access vaddr(pending_req,i) it has the contents of * the page from the other domain. 
*/ - bitmap_zero(pending_req->unmap_seg, BLKIF_MAX_SEGMENTS_PER_REQUEST); - for (i = 0, j = 0; i < nseg; i++) { - if (!persistent_gnts[i] || - persistent_gnts[i]->handle == BLKBACK_INVALID_HANDLE) { - /* This is a newly mapped grant */ - BUG_ON(j >= segs_to_map); - if (unlikely(map[j].status != 0)) { - pr_debug(DRV_PFX "invalid buffer -- could not remap it\n"); - map[j].handle = BLKBACK_INVALID_HANDLE; - ret |= 1; - if (persistent_gnts[i]) { - rb_erase(&persistent_gnts[i]->node, - &blkif->persistent_gnts); - blkif->persistent_gnt_c--; - kfree(persistent_gnts[i]); - persistent_gnts[i] = NULL; - } - } - } - if (persistent_gnts[i]) { - if (persistent_gnts[i]->handle == - BLKBACK_INVALID_HANDLE) { - /* - * If this is a new persistent grant - * save the handler - */ - persistent_gnts[i]->handle = map[j].handle; - persistent_gnts[i]->dev_bus_addr = - map[j++].dev_bus_addr; - } - pending_handle(pending_req, i) = - persistent_gnts[i]->handle; - - if (ret) - continue; - - seg[i].buf = persistent_gnts[i]->dev_bus_addr | - (req->u.rw.seg[i].first_sect << 9); - } else { - pending_handle(pending_req, i) = map[j].handle; - bitmap_set(pending_req->unmap_seg, i, 1); - - if (ret) { - j++; - continue; - } - - seg[i].buf = map[j++].dev_bus_addr | - (req->u.rw.seg[i].first_sect << 9); + for (i = 0; i < nseg; i++) { + if (unlikely(map[i].status != 0)) { + pr_debug(DRV_PFX "invalid buffer -- could not remap it\n"); + map[i].handle = BLKBACK_INVALID_HANDLE; + ret |= 1; } + + pending_handle(pending_req, i) = map[i].handle; + + if (ret) + continue; + + seg[i].buf = map[i].dev_bus_addr | + (req->u.rw.seg[i].first_sect << 9); } return ret; } @@ -841,7 +591,6 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, int operation; struct blk_plug plug; bool drain = false; - struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; switch (req->operation) { case BLKIF_OP_READ: @@ -928,7 +677,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, * the hypercall to unmap the grants - that is all done in * xen_blkbk_unmap. */ - if (xen_blkbk_map(req, pending_req, seg, pages)) + if (xen_blkbk_map(req, pending_req, seg)) goto fail_flush; /* @@ -940,7 +689,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, for (i = 0; i < nseg; i++) { while ((bio == NULL) || (bio_add_page(bio, - pages[i], + blkbk->pending_page(pending_req, i), seg[i].nsec << 9, seg[i].buf & ~PAGE_MASK) == 0)) { diff --git a/trunk/drivers/block/xen-blkback/common.h b/trunk/drivers/block/xen-blkback/common.h index 6072390c7f57..9a54623e52d7 100644 --- a/trunk/drivers/block/xen-blkback/common.h +++ b/trunk/drivers/block/xen-blkback/common.h @@ -34,7 +34,6 @@ #include #include #include -#include #include #include #include @@ -161,21 +160,10 @@ struct xen_vbd { sector_t size; unsigned int flush_support:1; unsigned int discard_secure:1; - unsigned int feature_gnt_persistent:1; - unsigned int overflow_max_grants:1; }; struct backend_info; - -struct persistent_gnt { - struct page *page; - grant_ref_t gnt; - grant_handle_t handle; - uint64_t dev_bus_addr; - struct rb_node node; -}; - struct xen_blkif { /* Unique identifier for this interface. 
*/ domid_t domid; @@ -202,10 +190,6 @@ struct xen_blkif { struct task_struct *xenblkd; unsigned int waiting_reqs; - /* tree to store persistent grants */ - struct rb_root persistent_gnts; - unsigned int persistent_gnt_c; - /* statistics */ unsigned long st_print; int st_rd_req; diff --git a/trunk/drivers/block/xen-blkback/xenbus.c b/trunk/drivers/block/xen-blkback/xenbus.c index 63980722db41..f58434c2617c 100644 --- a/trunk/drivers/block/xen-blkback/xenbus.c +++ b/trunk/drivers/block/xen-blkback/xenbus.c @@ -117,7 +117,6 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid) atomic_set(&blkif->drain, 0); blkif->st_print = jiffies; init_waitqueue_head(&blkif->waiting_to_free); - blkif->persistent_gnts.rb_node = NULL; return blkif; } @@ -673,13 +672,6 @@ static void connect(struct backend_info *be) xen_blkbk_barrier(xbt, be, be->blkif->vbd.flush_support); - err = xenbus_printf(xbt, dev->nodename, "feature-persistent", "%u", 1); - if (err) { - xenbus_dev_fatal(dev, err, "writing %s/feature-persistent", - dev->nodename); - goto abort; - } - err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu", (unsigned long long)vbd_sz(&be->blkif->vbd)); if (err) { @@ -728,7 +720,6 @@ static int connect_ring(struct backend_info *be) struct xenbus_device *dev = be->dev; unsigned long ring_ref; unsigned int evtchn; - unsigned int pers_grants; char protocol[64] = ""; int err; @@ -758,18 +749,8 @@ static int connect_ring(struct backend_info *be) xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol); return -1; } - err = xenbus_gather(XBT_NIL, dev->otherend, - "feature-persistent", "%u", - &pers_grants, NULL); - if (err) - pers_grants = 0; - - be->blkif->vbd.feature_gnt_persistent = pers_grants; - be->blkif->vbd.overflow_max_grants = 0; - - pr_info(DRV_PFX "ring-ref %ld, event-channel %d, protocol %d (%s) %s\n", - ring_ref, evtchn, be->blkif->blk_protocol, protocol, - pers_grants ? "persistent grants" : ""); + pr_info(DRV_PFX "ring-ref %ld, event-channel %d, protocol %d (%s)\n", + ring_ref, evtchn, be->blkif->blk_protocol, protocol); /* Map the shared frame, irq etc. 
*/ err = xen_blkif_map(be->blkif, ring_ref, evtchn); diff --git a/trunk/drivers/block/xen-blkfront.c b/trunk/drivers/block/xen-blkfront.c index 96e9b00db081..007db8986e84 100644 --- a/trunk/drivers/block/xen-blkfront.c +++ b/trunk/drivers/block/xen-blkfront.c @@ -44,7 +44,6 @@ #include #include #include -#include #include #include @@ -65,17 +64,10 @@ enum blkif_state { BLKIF_STATE_SUSPENDED, }; -struct grant { - grant_ref_t gref; - unsigned long pfn; - struct llist_node node; -}; - struct blk_shadow { struct blkif_request req; struct request *request; unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST]; - struct grant *grants_used[BLKIF_MAX_SEGMENTS_PER_REQUEST]; }; static DEFINE_MUTEX(blkfront_mutex); @@ -105,8 +97,6 @@ struct blkfront_info struct work_struct work; struct gnttab_free_callback callback; struct blk_shadow shadow[BLK_RING_SIZE]; - struct llist_head persistent_gnts; - unsigned int persistent_gnts_c; unsigned long shadow_free; unsigned int feature_flush; unsigned int flush_op; @@ -114,7 +104,6 @@ struct blkfront_info unsigned int feature_secdiscard:1; unsigned int discard_granularity; unsigned int discard_alignment; - unsigned int feature_persistent:1; int is_ready; }; @@ -298,36 +287,21 @@ static int blkif_queue_request(struct request *req) unsigned long id; unsigned int fsect, lsect; int i, ref; - - /* - * Used to store if we are able to queue the request by just using - * existing persistent grants, or if we have to get new grants, - * as there are not sufficiently many free. - */ - bool new_persistent_gnts; grant_ref_t gref_head; - struct page *granted_page; - struct grant *gnt_list_entry = NULL; struct scatterlist *sg; if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) return 1; - /* Check if we have enought grants to allocate a requests */ - if (info->persistent_gnts_c < BLKIF_MAX_SEGMENTS_PER_REQUEST) { - new_persistent_gnts = 1; - if (gnttab_alloc_grant_references( - BLKIF_MAX_SEGMENTS_PER_REQUEST - info->persistent_gnts_c, - &gref_head) < 0) { - gnttab_request_free_callback( - &info->callback, - blkif_restart_queue_callback, - info, - BLKIF_MAX_SEGMENTS_PER_REQUEST); - return 1; - } - } else - new_persistent_gnts = 0; + if (gnttab_alloc_grant_references( + BLKIF_MAX_SEGMENTS_PER_REQUEST, &gref_head) < 0) { + gnttab_request_free_callback( + &info->callback, + blkif_restart_queue_callback, + info, + BLKIF_MAX_SEGMENTS_PER_REQUEST); + return 1; + } /* Fill out a communications ring structure. */ ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt); @@ -367,73 +341,18 @@ static int blkif_queue_request(struct request *req) BLKIF_MAX_SEGMENTS_PER_REQUEST); for_each_sg(info->sg, sg, ring_req->u.rw.nr_segments, i) { + buffer_mfn = pfn_to_mfn(page_to_pfn(sg_page(sg))); fsect = sg->offset >> 9; lsect = fsect + (sg->length >> 9) - 1; + /* install a grant reference. 
*/ + ref = gnttab_claim_grant_reference(&gref_head); + BUG_ON(ref == -ENOSPC); - if (info->persistent_gnts_c) { - BUG_ON(llist_empty(&info->persistent_gnts)); - gnt_list_entry = llist_entry( - llist_del_first(&info->persistent_gnts), - struct grant, node); - - ref = gnt_list_entry->gref; - buffer_mfn = pfn_to_mfn(gnt_list_entry->pfn); - info->persistent_gnts_c--; - } else { - ref = gnttab_claim_grant_reference(&gref_head); - BUG_ON(ref == -ENOSPC); - - gnt_list_entry = - kmalloc(sizeof(struct grant), - GFP_ATOMIC); - if (!gnt_list_entry) - return -ENOMEM; - - granted_page = alloc_page(GFP_ATOMIC); - if (!granted_page) { - kfree(gnt_list_entry); - return -ENOMEM; - } - - gnt_list_entry->pfn = - page_to_pfn(granted_page); - gnt_list_entry->gref = ref; - - buffer_mfn = pfn_to_mfn(page_to_pfn( - granted_page)); - gnttab_grant_foreign_access_ref(ref, + gnttab_grant_foreign_access_ref( + ref, info->xbdev->otherend_id, - buffer_mfn, 0); - } - - info->shadow[id].grants_used[i] = gnt_list_entry; - - if (rq_data_dir(req)) { - char *bvec_data; - void *shared_data; - - BUG_ON(sg->offset + sg->length > PAGE_SIZE); - - shared_data = kmap_atomic( - pfn_to_page(gnt_list_entry->pfn)); - bvec_data = kmap_atomic(sg_page(sg)); - - /* - * this does not wipe data stored outside the - * range sg->offset..sg->offset+sg->length. - * Therefore, blkback *could* see data from - * previous requests. This is OK as long as - * persistent grants are shared with just one - * domain. It may need refactoring if this - * changes - */ - memcpy(shared_data + sg->offset, - bvec_data + sg->offset, - sg->length); - - kunmap_atomic(bvec_data); - kunmap_atomic(shared_data); - } + buffer_mfn, + rq_data_dir(req)); info->shadow[id].frame[i] = mfn_to_pfn(buffer_mfn); ring_req->u.rw.seg[i] = @@ -449,8 +368,7 @@ static int blkif_queue_request(struct request *req) /* Keep a private copy so we can reissue requests when recovering. */ info->shadow[id].req = *ring_req; - if (new_persistent_gnts) - gnttab_free_grant_references(gref_head); + gnttab_free_grant_references(gref_head); return 0; } @@ -562,13 +480,12 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size) static void xlvbd_flush(struct blkfront_info *info) { blk_queue_flush(info->rq, info->feature_flush); - printk(KERN_INFO "blkfront: %s: %s: %s %s\n", + printk(KERN_INFO "blkfront: %s: %s: %s\n", info->gd->disk_name, info->flush_op == BLKIF_OP_WRITE_BARRIER ? "barrier" : (info->flush_op == BLKIF_OP_FLUSH_DISKCACHE ? "flush diskcache" : "barrier or flush"), - info->feature_flush ? "enabled" : "disabled", - info->feature_persistent ? "using persistent grants" : ""); + info->feature_flush ? "enabled" : "disabled"); } static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset) @@ -790,9 +707,6 @@ static void blkif_restart_queue(struct work_struct *work) static void blkif_free(struct blkfront_info *info, int suspend) { - struct llist_node *all_gnts; - struct grant *persistent_gnt; - /* Prevent new requests being issued until we fix things up. */ spin_lock_irq(&info->io_lock); info->connected = suspend ? @@ -800,18 +714,6 @@ static void blkif_free(struct blkfront_info *info, int suspend) /* No more blkif_request(). 
*/ if (info->rq) blk_stop_queue(info->rq); - - /* Remove all persistent grants */ - if (info->persistent_gnts_c) { - all_gnts = llist_del_all(&info->persistent_gnts); - llist_for_each_entry(persistent_gnt, all_gnts, node) { - gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL); - __free_page(pfn_to_page(persistent_gnt->pfn)); - kfree(persistent_gnt); - } - info->persistent_gnts_c = 0; - } - /* No more gnttab callback work. */ gnttab_cancel_free_callback(&info->callback); spin_unlock_irq(&info->io_lock); @@ -832,43 +734,13 @@ static void blkif_free(struct blkfront_info *info, int suspend) } -static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, - struct blkif_response *bret) +static void blkif_completion(struct blk_shadow *s) { int i; - struct bio_vec *bvec; - struct req_iterator iter; - unsigned long flags; - char *bvec_data; - void *shared_data; - unsigned int offset = 0; - - if (bret->operation == BLKIF_OP_READ) { - /* - * Copy the data received from the backend into the bvec. - * Since bv_offset can be different than 0, and bv_len different - * than PAGE_SIZE, we have to keep track of the current offset, - * to be sure we are copying the data from the right shared page. - */ - rq_for_each_segment(bvec, s->request, iter) { - BUG_ON((bvec->bv_offset + bvec->bv_len) > PAGE_SIZE); - i = offset >> PAGE_SHIFT; - BUG_ON(i >= s->req.u.rw.nr_segments); - shared_data = kmap_atomic( - pfn_to_page(s->grants_used[i]->pfn)); - bvec_data = bvec_kmap_irq(bvec, &flags); - memcpy(bvec_data, shared_data + bvec->bv_offset, - bvec->bv_len); - bvec_kunmap_irq(bvec_data, &flags); - kunmap_atomic(shared_data); - offset += bvec->bv_len; - } - } - /* Add the persistent grant into the list of free grants */ - for (i = 0; i < s->req.u.rw.nr_segments; i++) { - llist_add(&s->grants_used[i]->node, &info->persistent_gnts); - info->persistent_gnts_c++; - } + /* Do not let BLKIF_OP_DISCARD as nr_segment is in the same place + * flag. 
*/ + for (i = 0; i < s->req.u.rw.nr_segments; i++) + gnttab_end_foreign_access(s->req.u.rw.seg[i].gref, 0, 0UL); } static irqreturn_t blkif_interrupt(int irq, void *dev_id) @@ -911,7 +783,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) req = info->shadow[id].request; if (bret->operation != BLKIF_OP_DISCARD) - blkif_completion(&info->shadow[id], info, bret); + blkif_completion(&info->shadow[id]); if (add_id_to_freelist(info, id)) { WARN(1, "%s: response to %s (id %ld) couldn't be recycled!\n", @@ -1070,11 +942,6 @@ static int talk_to_blkback(struct xenbus_device *dev, message = "writing protocol"; goto abort_transaction; } - err = xenbus_printf(xbt, dev->nodename, - "feature-persistent", "%u", 1); - if (err) - dev_warn(&dev->dev, - "writing persistent grants feature to xenbus"); err = xenbus_transaction_end(xbt, 0); if (err) { @@ -1162,8 +1029,6 @@ static int blkfront_probe(struct xenbus_device *dev, spin_lock_init(&info->io_lock); info->xbdev = dev; info->vdevice = vdevice; - init_llist_head(&info->persistent_gnts); - info->persistent_gnts_c = 0; info->connected = BLKIF_STATE_DISCONNECTED; INIT_WORK(&info->work, blkif_restart_queue); @@ -1228,7 +1093,7 @@ static int blkif_recover(struct blkfront_info *info) req->u.rw.seg[j].gref, info->xbdev->otherend_id, pfn_to_mfn(info->shadow[req->u.rw.id].frame[j]), - 0); + rq_data_dir(info->shadow[req->u.rw.id].request)); } info->shadow[req->u.rw.id].req = *req; @@ -1360,7 +1225,7 @@ static void blkfront_connect(struct blkfront_info *info) unsigned long sector_size; unsigned int binfo; int err; - int barrier, flush, discard, persistent; + int barrier, flush, discard; switch (info->connected) { case BLKIF_STATE_CONNECTED: @@ -1438,14 +1303,6 @@ static void blkfront_connect(struct blkfront_info *info) if (!err && discard) blkfront_setup_discard(info); - err = xenbus_gather(XBT_NIL, info->xbdev->otherend, - "feature-persistent", "%u", &persistent, - NULL); - if (err) - info->feature_persistent = 0; - else - info->feature_persistent = persistent; - err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size); if (err) { xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s", diff --git a/trunk/drivers/firmware/efivars.c b/trunk/drivers/firmware/efivars.c index 7b1c37497c9a..52c5d8956d7d 100644 --- a/trunk/drivers/firmware/efivars.c +++ b/trunk/drivers/firmware/efivars.c @@ -883,6 +883,7 @@ static struct inode *efivarfs_get_inode(struct super_block *sb, if (inode) { inode->i_ino = get_next_ino(); + inode->i_uid = inode->i_gid = 0; inode->i_mode = mode; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; switch (mode & S_IFMT) { diff --git a/trunk/drivers/md/md.c b/trunk/drivers/md/md.c index 4843b004c558..bd8bf0953fe3 100644 --- a/trunk/drivers/md/md.c +++ b/trunk/drivers/md/md.c @@ -452,7 +452,7 @@ void md_flush_request(struct mddev *mddev, struct bio *bio) spin_lock_irq(&mddev->write_lock); wait_event_lock_irq(mddev->sb_wait, !mddev->flush_bio, - mddev->write_lock); + mddev->write_lock, /*nothing*/); mddev->flush_bio = bio; spin_unlock_irq(&mddev->write_lock); diff --git a/trunk/drivers/md/md.h b/trunk/drivers/md/md.h index 1e2fc3d9c74c..af443ab868db 100644 --- a/trunk/drivers/md/md.h +++ b/trunk/drivers/md/md.h @@ -551,6 +551,32 @@ struct md_thread { #define THREAD_WAKEUP 0 +#define __wait_event_lock_irq(wq, condition, lock, cmd) \ +do { \ + wait_queue_t __wait; \ + init_waitqueue_entry(&__wait, current); \ + \ + add_wait_queue(&wq, &__wait); \ + for (;;) { \ + set_current_state(TASK_UNINTERRUPTIBLE); \ + if (condition) \ + 
break; \ + spin_unlock_irq(&lock); \ + cmd; \ + schedule(); \ + spin_lock_irq(&lock); \ + } \ + current->state = TASK_RUNNING; \ + remove_wait_queue(&wq, &__wait); \ +} while (0) + +#define wait_event_lock_irq(wq, condition, lock, cmd) \ +do { \ + if (condition) \ + break; \ + __wait_event_lock_irq(wq, condition, lock, cmd); \ +} while (0) + static inline void safe_put_page(struct page *p) { if (p) put_page(p); diff --git a/trunk/drivers/md/raid1.c b/trunk/drivers/md/raid1.c index d5bddfc4010e..a0f73092176e 100644 --- a/trunk/drivers/md/raid1.c +++ b/trunk/drivers/md/raid1.c @@ -822,7 +822,7 @@ static void raise_barrier(struct r1conf *conf) /* Wait until no block IO is waiting */ wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting, - conf->resync_lock); + conf->resync_lock, ); /* block any new IO from starting */ conf->barrier++; @@ -830,7 +830,7 @@ static void raise_barrier(struct r1conf *conf) /* Now wait for all pending IO to complete */ wait_event_lock_irq(conf->wait_barrier, !conf->nr_pending && conf->barrier < RESYNC_DEPTH, - conf->resync_lock); + conf->resync_lock, ); spin_unlock_irq(&conf->resync_lock); } @@ -864,7 +864,8 @@ static void wait_barrier(struct r1conf *conf) (conf->nr_pending && current->bio_list && !bio_list_empty(current->bio_list)), - conf->resync_lock); + conf->resync_lock, + ); conf->nr_waiting--; } conf->nr_pending++; @@ -897,10 +898,10 @@ static void freeze_array(struct r1conf *conf) spin_lock_irq(&conf->resync_lock); conf->barrier++; conf->nr_waiting++; - wait_event_lock_irq_cmd(conf->wait_barrier, - conf->nr_pending == conf->nr_queued+1, - conf->resync_lock, - flush_pending_writes(conf)); + wait_event_lock_irq(conf->wait_barrier, + conf->nr_pending == conf->nr_queued+1, + conf->resync_lock, + flush_pending_writes(conf)); spin_unlock_irq(&conf->resync_lock); } static void unfreeze_array(struct r1conf *conf) diff --git a/trunk/drivers/md/raid10.c b/trunk/drivers/md/raid10.c index 64d48249c03b..c9acbd717131 100644 --- a/trunk/drivers/md/raid10.c +++ b/trunk/drivers/md/raid10.c @@ -952,7 +952,7 @@ static void raise_barrier(struct r10conf *conf, int force) /* Wait until no block IO is waiting (unless 'force') */ wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting, - conf->resync_lock); + conf->resync_lock, ); /* block any new IO from starting */ conf->barrier++; @@ -960,7 +960,7 @@ static void raise_barrier(struct r10conf *conf, int force) /* Now wait for all pending IO to complete */ wait_event_lock_irq(conf->wait_barrier, !conf->nr_pending && conf->barrier < RESYNC_DEPTH, - conf->resync_lock); + conf->resync_lock, ); spin_unlock_irq(&conf->resync_lock); } @@ -993,7 +993,8 @@ static void wait_barrier(struct r10conf *conf) (conf->nr_pending && current->bio_list && !bio_list_empty(current->bio_list)), - conf->resync_lock); + conf->resync_lock, + ); conf->nr_waiting--; } conf->nr_pending++; @@ -1026,10 +1027,10 @@ static void freeze_array(struct r10conf *conf) spin_lock_irq(&conf->resync_lock); conf->barrier++; conf->nr_waiting++; - wait_event_lock_irq_cmd(conf->wait_barrier, - conf->nr_pending == conf->nr_queued+1, - conf->resync_lock, - flush_pending_writes(conf)); + wait_event_lock_irq(conf->wait_barrier, + conf->nr_pending == conf->nr_queued+1, + conf->resync_lock, + flush_pending_writes(conf)); spin_unlock_irq(&conf->resync_lock); } diff --git a/trunk/drivers/md/raid5.c b/trunk/drivers/md/raid5.c index 8d8555bf3e1d..3380372c0393 100644 --- a/trunk/drivers/md/raid5.c +++ b/trunk/drivers/md/raid5.c @@ -466,7 +466,7 @@ get_active_stripe(struct 
r5conf *conf, sector_t sector, do { wait_event_lock_irq(conf->wait_for_stripe, conf->quiesce == 0 || noquiesce, - conf->device_lock); + conf->device_lock, /* nothing */); sh = __find_stripe(conf, sector, conf->generation - previous); if (!sh) { if (!conf->inactive_blocked) @@ -480,7 +480,8 @@ get_active_stripe(struct r5conf *conf, sector_t sector, (atomic_read(&conf->active_stripes) < (conf->max_nr_stripes *3/4) || !conf->inactive_blocked), - conf->device_lock); + conf->device_lock, + ); conf->inactive_blocked = 0; } else init_stripe(sh, sector, previous); @@ -1645,7 +1646,8 @@ static int resize_stripes(struct r5conf *conf, int newsize) spin_lock_irq(&conf->device_lock); wait_event_lock_irq(conf->wait_for_stripe, !list_empty(&conf->inactive_list), - conf->device_lock); + conf->device_lock, + ); osh = get_free_stripe(conf); spin_unlock_irq(&conf->device_lock); atomic_set(&nsh->count, 1); @@ -4001,7 +4003,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) spin_lock_irq(&conf->device_lock); wait_event_lock_irq(conf->wait_for_stripe, conf->quiesce == 0, - conf->device_lock); + conf->device_lock, /* nothing */); atomic_inc(&conf->active_aligned_reads); spin_unlock_irq(&conf->device_lock); @@ -6093,7 +6095,7 @@ static void raid5_quiesce(struct mddev *mddev, int state) wait_event_lock_irq(conf->wait_for_stripe, atomic_read(&conf->active_stripes) == 0 && atomic_read(&conf->active_aligned_reads) == 0, - conf->device_lock); + conf->device_lock, /* nothing */); conf->quiesce = 1; spin_unlock_irq(&conf->device_lock); /* allow reshape to continue */ diff --git a/trunk/drivers/staging/android/binder.c b/trunk/drivers/staging/android/binder.c index 2d12e8a1f82e..4a36e9ab8cf7 100644 --- a/trunk/drivers/staging/android/binder.c +++ b/trunk/drivers/staging/android/binder.c @@ -35,7 +35,6 @@ #include #include #include -#include #include "binder.h" #include "binder_trace.h" @@ -2321,7 +2320,7 @@ static int binder_thread_read(struct binder_proc *proc, if (t->from) { struct task_struct *sender = t->from->proc->tsk; tr.sender_pid = task_tgid_nr_ns(sender, - task_active_pid_ns(current)); + current->nsproxy->pid_ns); } else { tr.sender_pid = 0; } diff --git a/trunk/fs/attr.c b/trunk/fs/attr.c index 1449adb14ef6..cce7df53b694 100644 --- a/trunk/fs/attr.c +++ b/trunk/fs/attr.c @@ -49,15 +49,14 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr) /* Make sure a caller can chown. */ if ((ia_valid & ATTR_UID) && (!uid_eq(current_fsuid(), inode->i_uid) || - !uid_eq(attr->ia_uid, inode->i_uid)) && - !inode_capable(inode, CAP_CHOWN)) + !uid_eq(attr->ia_uid, inode->i_uid)) && !capable(CAP_CHOWN)) return -EPERM; /* Make sure caller can chgrp. */ if ((ia_valid & ATTR_GID) && (!uid_eq(current_fsuid(), inode->i_uid) || (!in_group_p(attr->ia_gid) && !gid_eq(attr->ia_gid, inode->i_gid))) && - !inode_capable(inode, CAP_CHOWN)) + !capable(CAP_CHOWN)) return -EPERM; /* Make sure a caller can chmod. */ @@ -66,8 +65,7 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr) return -EPERM; /* Also check the setgid bit! */ if (!in_group_p((ia_valid & ATTR_GID) ? 
attr->ia_gid : - inode->i_gid) && - !inode_capable(inode, CAP_FSETID)) + inode->i_gid) && !capable(CAP_FSETID)) attr->ia_mode &= ~S_ISGID; } @@ -159,8 +157,7 @@ void setattr_copy(struct inode *inode, const struct iattr *attr) if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; - if (!in_group_p(inode->i_gid) && - !inode_capable(inode, CAP_FSETID)) + if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) mode &= ~S_ISGID; inode->i_mode = mode; } diff --git a/trunk/fs/autofs4/autofs_i.h b/trunk/fs/autofs4/autofs_i.h index b785e7707959..908e18455413 100644 --- a/trunk/fs/autofs4/autofs_i.h +++ b/trunk/fs/autofs4/autofs_i.h @@ -74,8 +74,8 @@ struct autofs_info { unsigned long last_used; atomic_t count; - kuid_t uid; - kgid_t gid; + uid_t uid; + gid_t gid; }; #define AUTOFS_INF_EXPIRING (1<<0) /* dentry is in the process of expiring */ @@ -89,8 +89,8 @@ struct autofs_wait_queue { struct qstr name; u32 dev; u64 ino; - kuid_t uid; - kgid_t gid; + uid_t uid; + gid_t gid; pid_t pid; pid_t tgid; /* This is for status reporting upon return */ diff --git a/trunk/fs/autofs4/dev-ioctl.c b/trunk/fs/autofs4/dev-ioctl.c index 9f68a37bb2b2..a16214109d31 100644 --- a/trunk/fs/autofs4/dev-ioctl.c +++ b/trunk/fs/autofs4/dev-ioctl.c @@ -437,8 +437,8 @@ static int autofs_dev_ioctl_requester(struct file *fp, err = 0; autofs4_expire_wait(path.dentry); spin_lock(&sbi->fs_lock); - param->requester.uid = from_kuid_munged(current_user_ns(), ino->uid); - param->requester.gid = from_kgid_munged(current_user_ns(), ino->gid); + param->requester.uid = ino->uid; + param->requester.gid = ino->gid; spin_unlock(&sbi->fs_lock); } path_put(&path); diff --git a/trunk/fs/autofs4/inode.c b/trunk/fs/autofs4/inode.c index b104726e2d0a..8a4fed8ead30 100644 --- a/trunk/fs/autofs4/inode.c +++ b/trunk/fs/autofs4/inode.c @@ -36,8 +36,8 @@ struct autofs_info *autofs4_new_ino(struct autofs_sb_info *sbi) void autofs4_clean_ino(struct autofs_info *ino) { - ino->uid = GLOBAL_ROOT_UID; - ino->gid = GLOBAL_ROOT_GID; + ino->uid = 0; + ino->gid = 0; ino->last_used = jiffies; } @@ -79,12 +79,10 @@ static int autofs4_show_options(struct seq_file *m, struct dentry *root) return 0; seq_printf(m, ",fd=%d", sbi->pipefd); - if (!uid_eq(root_inode->i_uid, GLOBAL_ROOT_UID)) - seq_printf(m, ",uid=%u", - from_kuid_munged(&init_user_ns, root_inode->i_uid)); - if (!gid_eq(root_inode->i_gid, GLOBAL_ROOT_GID)) - seq_printf(m, ",gid=%u", - from_kgid_munged(&init_user_ns, root_inode->i_gid)); + if (root_inode->i_uid != 0) + seq_printf(m, ",uid=%u", root_inode->i_uid); + if (root_inode->i_gid != 0) + seq_printf(m, ",gid=%u", root_inode->i_gid); seq_printf(m, ",pgrp=%d", sbi->oz_pgrp); seq_printf(m, ",timeout=%lu", sbi->exp_timeout/HZ); seq_printf(m, ",minproto=%d", sbi->min_proto); @@ -128,7 +126,7 @@ static const match_table_t tokens = { {Opt_err, NULL} }; -static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid, +static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid, pid_t *pgrp, unsigned int *type, int *minproto, int *maxproto) { char *p; @@ -161,16 +159,12 @@ static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid, case Opt_uid: if (match_int(args, &option)) return 1; - *uid = make_kuid(current_user_ns(), option); - if (!uid_valid(*uid)) - return 1; + *uid = option; break; case Opt_gid: if (match_int(args, &option)) return 1; - *gid = make_kgid(current_user_ns(), option); - if (!gid_valid(*gid)) - return 1; + *gid = option; break; case Opt_pgrp: if (match_int(args, &option)) diff --git 
a/trunk/fs/autofs4/waitq.c b/trunk/fs/autofs4/waitq.c index 03bc1d347d8e..dce436e595c1 100644 --- a/trunk/fs/autofs4/waitq.c +++ b/trunk/fs/autofs4/waitq.c @@ -154,7 +154,6 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, case autofs_ptype_expire_direct: { struct autofs_v5_packet *packet = &pkt.v5_pkt.v5_packet; - struct user_namespace *user_ns = sbi->pipe->f_cred->user_ns; pktsz = sizeof(*packet); @@ -164,8 +163,8 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, packet->name[wq->name.len] = '\0'; packet->dev = wq->dev; packet->ino = wq->ino; - packet->uid = from_kuid_munged(user_ns, wq->uid); - packet->gid = from_kgid_munged(user_ns, wq->gid); + packet->uid = wq->uid; + packet->gid = wq->gid; packet->pid = wq->pid; packet->tgid = wq->tgid; break; diff --git a/trunk/fs/exec.c b/trunk/fs/exec.c index b71b08ce7120..721a29929511 100644 --- a/trunk/fs/exec.c +++ b/trunk/fs/exec.c @@ -1266,13 +1266,14 @@ int prepare_binprm(struct linux_binprm *bprm) bprm->cred->egid = current_egid(); if (!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) && - !current->no_new_privs && - kuid_has_mapping(bprm->cred->user_ns, inode->i_uid) && - kgid_has_mapping(bprm->cred->user_ns, inode->i_gid)) { + !current->no_new_privs) { /* Set-uid? */ if (mode & S_ISUID) { + if (!kuid_has_mapping(bprm->cred->user_ns, inode->i_uid)) + return -EPERM; bprm->per_clear |= PER_CLEAR_ON_SETID; bprm->cred->euid = inode->i_uid; + } /* Set-gid? */ @@ -1282,6 +1283,8 @@ int prepare_binprm(struct linux_binprm *bprm) * executable. */ if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) { + if (!kgid_has_mapping(bprm->cred->user_ns, inode->i_gid)) + return -EPERM; bprm->per_clear |= PER_CLEAR_ON_SETID; bprm->cred->egid = inode->i_gid; } diff --git a/trunk/fs/fuse/dev.c b/trunk/fs/fuse/dev.c index c16335315e5d..8c23fa7a91e6 100644 --- a/trunk/fs/fuse/dev.c +++ b/trunk/fs/fuse/dev.c @@ -92,8 +92,8 @@ static void __fuse_put_request(struct fuse_req *req) static void fuse_req_init_context(struct fuse_req *req) { - req->in.h.uid = from_kuid_munged(&init_user_ns, current_fsuid()); - req->in.h.gid = from_kgid_munged(&init_user_ns, current_fsgid()); + req->in.h.uid = current_fsuid(); + req->in.h.gid = current_fsgid(); req->in.h.pid = current->pid; } diff --git a/trunk/fs/fuse/dir.c b/trunk/fs/fuse/dir.c index b7c09f9eb40c..324bc0850534 100644 --- a/trunk/fs/fuse/dir.c +++ b/trunk/fs/fuse/dir.c @@ -818,8 +818,8 @@ static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr, stat->ino = attr->ino; stat->mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); stat->nlink = attr->nlink; - stat->uid = make_kuid(&init_user_ns, attr->uid); - stat->gid = make_kgid(&init_user_ns, attr->gid); + stat->uid = attr->uid; + stat->gid = attr->gid; stat->rdev = inode->i_rdev; stat->atime.tv_sec = attr->atime; stat->atime.tv_nsec = attr->atimensec; @@ -1007,12 +1007,12 @@ int fuse_allow_task(struct fuse_conn *fc, struct task_struct *task) rcu_read_lock(); ret = 0; cred = __task_cred(task); - if (uid_eq(cred->euid, fc->user_id) && - uid_eq(cred->suid, fc->user_id) && - uid_eq(cred->uid, fc->user_id) && - gid_eq(cred->egid, fc->group_id) && - gid_eq(cred->sgid, fc->group_id) && - gid_eq(cred->gid, fc->group_id)) + if (cred->euid == fc->user_id && + cred->suid == fc->user_id && + cred->uid == fc->user_id && + cred->egid == fc->group_id && + cred->sgid == fc->group_id && + cred->gid == fc->group_id) ret = 1; rcu_read_unlock(); @@ -1306,9 +1306,9 @@ static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg) 
if (ivalid & ATTR_MODE) arg->valid |= FATTR_MODE, arg->mode = iattr->ia_mode; if (ivalid & ATTR_UID) - arg->valid |= FATTR_UID, arg->uid = from_kuid(&init_user_ns, iattr->ia_uid); + arg->valid |= FATTR_UID, arg->uid = iattr->ia_uid; if (ivalid & ATTR_GID) - arg->valid |= FATTR_GID, arg->gid = from_kgid(&init_user_ns, iattr->ia_gid); + arg->valid |= FATTR_GID, arg->gid = iattr->ia_gid; if (ivalid & ATTR_SIZE) arg->valid |= FATTR_SIZE, arg->size = iattr->ia_size; if (ivalid & ATTR_ATIME) { diff --git a/trunk/fs/fuse/fuse_i.h b/trunk/fs/fuse/fuse_i.h index e105a53fc72d..e24dd74e3068 100644 --- a/trunk/fs/fuse/fuse_i.h +++ b/trunk/fs/fuse/fuse_i.h @@ -333,10 +333,10 @@ struct fuse_conn { atomic_t count; /** The user id for this mount */ - kuid_t user_id; + uid_t user_id; /** The group id for this mount */ - kgid_t group_id; + gid_t group_id; /** The fuse mount flags for this mount */ unsigned flags; diff --git a/trunk/fs/fuse/inode.c b/trunk/fs/fuse/inode.c index 73ca6b72beaf..f0eda124cffb 100644 --- a/trunk/fs/fuse/inode.c +++ b/trunk/fs/fuse/inode.c @@ -60,8 +60,8 @@ MODULE_PARM_DESC(max_user_congthresh, struct fuse_mount_data { int fd; unsigned rootmode; - kuid_t user_id; - kgid_t group_id; + unsigned user_id; + unsigned group_id; unsigned fd_present:1; unsigned rootmode_present:1; unsigned user_id_present:1; @@ -164,8 +164,8 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, inode->i_ino = fuse_squash_ino(attr->ino); inode->i_mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); set_nlink(inode, attr->nlink); - inode->i_uid = make_kuid(&init_user_ns, attr->uid); - inode->i_gid = make_kgid(&init_user_ns, attr->gid); + inode->i_uid = attr->uid; + inode->i_gid = attr->gid; inode->i_blocks = attr->blocks; inode->i_atime.tv_sec = attr->atime; inode->i_atime.tv_nsec = attr->atimensec; @@ -492,18 +492,14 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev) case OPT_USER_ID: if (match_int(&args[0], &value)) return 0; - d->user_id = make_kuid(current_user_ns(), value); - if (!uid_valid(d->user_id)) - return 0; + d->user_id = value; d->user_id_present = 1; break; case OPT_GROUP_ID: if (match_int(&args[0], &value)) return 0; - d->group_id = make_kgid(current_user_ns(), value); - if (!gid_valid(d->group_id)) - return 0; + d->group_id = value; d->group_id_present = 1; break; @@ -544,8 +540,8 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root) struct super_block *sb = root->d_sb; struct fuse_conn *fc = get_fuse_conn_super(sb); - seq_printf(m, ",user_id=%u", from_kuid_munged(&init_user_ns, fc->user_id)); - seq_printf(m, ",group_id=%u", from_kgid_munged(&init_user_ns, fc->group_id)); + seq_printf(m, ",user_id=%u", fc->user_id); + seq_printf(m, ",group_id=%u", fc->group_id); if (fc->flags & FUSE_DEFAULT_PERMISSIONS) seq_puts(m, ",default_permissions"); if (fc->flags & FUSE_ALLOW_OTHER) @@ -993,8 +989,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) if (!file) goto err; - if ((file->f_op != &fuse_dev_operations) || - (file->f_cred->user_ns != &init_user_ns)) + if (file->f_op != &fuse_dev_operations) goto err_fput; fc = kmalloc(sizeof(*fc), GFP_KERNEL); diff --git a/trunk/fs/hppfs/hppfs.c b/trunk/fs/hppfs/hppfs.c index 43b315f2002b..78f21f8dc2ec 100644 --- a/trunk/fs/hppfs/hppfs.c +++ b/trunk/fs/hppfs/hppfs.c @@ -710,7 +710,7 @@ static int hppfs_fill_super(struct super_block *sb, void *d, int silent) struct vfsmount *proc_mnt; int err = -ENOENT; - proc_mnt = 
mntget(task_active_pid_ns(current)->proc_mnt); + proc_mnt = mntget(current->nsproxy->pid_ns->proc_mnt); if (IS_ERR(proc_mnt)) goto out; diff --git a/trunk/fs/mount.h b/trunk/fs/mount.h index cd5007980400..4f291f9de641 100644 --- a/trunk/fs/mount.h +++ b/trunk/fs/mount.h @@ -4,11 +4,8 @@ struct mnt_namespace { atomic_t count; - unsigned int proc_inum; struct mount * root; struct list_head list; - struct user_namespace *user_ns; - u64 seq; /* Sequence number to prevent loops */ wait_queue_head_t poll; int event; }; diff --git a/trunk/fs/namespace.c b/trunk/fs/namespace.c index c1bbe86f4920..24960626bb6b 100644 --- a/trunk/fs/namespace.c +++ b/trunk/fs/namespace.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include @@ -21,7 +20,6 @@ #include /* get_fs_root et.al. */ #include /* fsnotify_vfsmount_delete */ #include -#include #include "pnode.h" #include "internal.h" @@ -786,7 +784,7 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, if (!mnt) return ERR_PTR(-ENOMEM); - if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE)) + if (flag & (CL_SLAVE | CL_PRIVATE)) mnt->mnt_group_id = 0; /* not a peer of original */ else mnt->mnt_group_id = old->mnt_group_id; @@ -807,8 +805,7 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, list_add_tail(&mnt->mnt_instance, &sb->s_mounts); br_write_unlock(&vfsmount_lock); - if ((flag & CL_SLAVE) || - ((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) { + if (flag & CL_SLAVE) { list_add(&mnt->mnt_slave, &old->mnt_slave_list); mnt->mnt_master = old; CLEAR_MNT_SHARED(mnt); @@ -1269,7 +1266,7 @@ SYSCALL_DEFINE2(umount, char __user *, name, int, flags) goto dput_and_out; retval = -EPERM; - if (!ns_capable(mnt->mnt_ns->user_ns, CAP_SYS_ADMIN)) + if (!capable(CAP_SYS_ADMIN)) goto dput_and_out; retval = do_umount(mnt, flags); @@ -1295,7 +1292,7 @@ SYSCALL_DEFINE1(oldumount, char __user *, name) static int mount_is_safe(struct path *path) { - if (ns_capable(real_mount(path->mnt)->mnt_ns->user_ns, CAP_SYS_ADMIN)) + if (capable(CAP_SYS_ADMIN)) return 0; return -EPERM; #ifdef notyet @@ -1311,26 +1308,6 @@ static int mount_is_safe(struct path *path) #endif } -static bool mnt_ns_loop(struct path *path) -{ - /* Could bind mounting the mount namespace inode cause a - * mount namespace loop? 
- */ - struct inode *inode = path->dentry->d_inode; - struct proc_inode *ei; - struct mnt_namespace *mnt_ns; - - if (!proc_ns_inode(inode)) - return false; - - ei = PROC_I(inode); - if (ei->ns_ops != &mntns_operations) - return false; - - mnt_ns = ei->ns; - return current->nsproxy->mnt_ns->seq >= mnt_ns->seq; -} - struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, int flag) { @@ -1633,7 +1610,7 @@ static int do_change_type(struct path *path, int flag) int type; int err = 0; - if (!ns_capable(mnt->mnt_ns->user_ns, CAP_SYS_ADMIN)) + if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (path->dentry != path->mnt->mnt_root) @@ -1678,10 +1655,6 @@ static int do_loopback(struct path *path, const char *old_name, if (err) return err; - err = -EINVAL; - if (mnt_ns_loop(&old_path)) - goto out; - err = lock_mount(path); if (err) goto out; @@ -1797,7 +1770,7 @@ static int do_move_mount(struct path *path, const char *old_name) struct mount *p; struct mount *old; int err = 0; - if (!ns_capable(real_mount(path->mnt)->mnt_ns->user_ns, CAP_SYS_ADMIN)) + if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (!old_name || !*old_name) return -EINVAL; @@ -1884,6 +1857,21 @@ static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype) return ERR_PTR(err); } +static struct vfsmount * +do_kern_mount(const char *fstype, int flags, const char *name, void *data) +{ + struct file_system_type *type = get_fs_type(fstype); + struct vfsmount *mnt; + if (!type) + return ERR_PTR(-ENODEV); + mnt = vfs_kern_mount(type, flags, name, data); + if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) && + !mnt->mnt_sb->s_subtype) + mnt = fs_set_subtype(mnt, fstype); + put_filesystem(type); + return mnt; +} + /* * add a mount into a namespace's mount tree */ @@ -1929,46 +1917,20 @@ static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags) * create a new mount for userspace and request it to be added into the * namespace's tree */ -static int do_new_mount(struct path *path, const char *fstype, int flags, +static int do_new_mount(struct path *path, const char *type, int flags, int mnt_flags, const char *name, void *data) { - struct file_system_type *type; - struct user_namespace *user_ns; struct vfsmount *mnt; int err; - if (!fstype) + if (!type) return -EINVAL; /* we need capabilities... */ - user_ns = real_mount(path->mnt)->mnt_ns->user_ns; - if (!ns_capable(user_ns, CAP_SYS_ADMIN)) + if (!capable(CAP_SYS_ADMIN)) return -EPERM; - type = get_fs_type(fstype); - if (!type) - return -ENODEV; - - if (user_ns != &init_user_ns) { - if (!(type->fs_flags & FS_USERNS_MOUNT)) { - put_filesystem(type); - return -EPERM; - } - /* Only in special cases allow devices from mounts - * created outside the initial user namespace. 
- */ - if (!(type->fs_flags & FS_USERNS_DEV_MOUNT)) { - flags |= MS_NODEV; - mnt_flags |= MNT_NODEV; - } - } - - mnt = vfs_kern_mount(type, flags, name, data); - if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) && - !mnt->mnt_sb->s_subtype) - mnt = fs_set_subtype(mnt, fstype); - - put_filesystem(type); + mnt = do_kern_mount(type, flags, name, data); if (IS_ERR(mnt)) return PTR_ERR(mnt); @@ -2299,42 +2261,18 @@ long do_mount(const char *dev_name, const char *dir_name, return retval; } -static void free_mnt_ns(struct mnt_namespace *ns) -{ - proc_free_inum(ns->proc_inum); - put_user_ns(ns->user_ns); - kfree(ns); -} - -/* - * Assign a sequence number so we can detect when we attempt to bind - * mount a reference to an older mount namespace into the current - * mount namespace, preventing reference counting loops. A 64bit - * number incrementing at 10Ghz will take 12,427 years to wrap which - * is effectively never, so we can ignore the possibility. - */ -static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1); - -static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns) +static struct mnt_namespace *alloc_mnt_ns(void) { struct mnt_namespace *new_ns; - int ret; new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL); if (!new_ns) return ERR_PTR(-ENOMEM); - ret = proc_alloc_inum(&new_ns->proc_inum); - if (ret) { - kfree(new_ns); - return ERR_PTR(ret); - } - new_ns->seq = atomic64_add_return(1, &mnt_ns_seq); atomic_set(&new_ns->count, 1); new_ns->root = NULL; INIT_LIST_HEAD(&new_ns->list); init_waitqueue_head(&new_ns->poll); new_ns->event = 0; - new_ns->user_ns = get_user_ns(user_ns); return new_ns; } @@ -2343,28 +2281,24 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns) * copied from the namespace of the passed in task structure. 
*/ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, - struct user_namespace *user_ns, struct fs_struct *fs) + struct fs_struct *fs) { struct mnt_namespace *new_ns; struct vfsmount *rootmnt = NULL, *pwdmnt = NULL; struct mount *p, *q; struct mount *old = mnt_ns->root; struct mount *new; - int copy_flags; - new_ns = alloc_mnt_ns(user_ns); + new_ns = alloc_mnt_ns(); if (IS_ERR(new_ns)) return new_ns; down_write(&namespace_sem); /* First pass: copy the tree topology */ - copy_flags = CL_COPY_ALL | CL_EXPIRE; - if (user_ns != mnt_ns->user_ns) - copy_flags |= CL_SHARED_TO_SLAVE; - new = copy_tree(old, old->mnt.mnt_root, copy_flags); + new = copy_tree(old, old->mnt.mnt_root, CL_COPY_ALL | CL_EXPIRE); if (IS_ERR(new)) { up_write(&namespace_sem); - free_mnt_ns(new_ns); + kfree(new_ns); return ERR_CAST(new); } new_ns->root = new; @@ -2405,7 +2339,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, } struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, - struct user_namespace *user_ns, struct fs_struct *new_fs) + struct fs_struct *new_fs) { struct mnt_namespace *new_ns; @@ -2415,7 +2349,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, if (!(flags & CLONE_NEWNS)) return ns; - new_ns = dup_mnt_ns(ns, user_ns, new_fs); + new_ns = dup_mnt_ns(ns, new_fs); put_mnt_ns(ns); return new_ns; @@ -2427,7 +2361,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, */ static struct mnt_namespace *create_mnt_ns(struct vfsmount *m) { - struct mnt_namespace *new_ns = alloc_mnt_ns(&init_user_ns); + struct mnt_namespace *new_ns = alloc_mnt_ns(); if (!IS_ERR(new_ns)) { struct mount *mnt = real_mount(m); mnt->mnt_ns = new_ns; @@ -2567,7 +2501,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, struct mount *new_mnt, *root_mnt; int error; - if (!ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN)) + if (!capable(CAP_SYS_ADMIN)) return -EPERM; error = user_path_dir(new_root, &new); @@ -2649,13 +2583,8 @@ static void __init init_mount_tree(void) struct vfsmount *mnt; struct mnt_namespace *ns; struct path root; - struct file_system_type *type; - type = get_fs_type("rootfs"); - if (!type) - panic("Can't find rootfs type"); - mnt = vfs_kern_mount(type, 0, "rootfs", NULL); - put_filesystem(type); + mnt = do_kern_mount("rootfs", 0, "rootfs", NULL); if (IS_ERR(mnt)) panic("Can't create rootfs"); @@ -2718,7 +2647,7 @@ void put_mnt_ns(struct mnt_namespace *ns) br_write_unlock(&vfsmount_lock); up_write(&namespace_sem); release_mounts(&umount_list); - free_mnt_ns(ns); + kfree(ns); } struct vfsmount *kern_mount_data(struct file_system_type *type, void *data) @@ -2752,71 +2681,3 @@ bool our_mnt(struct vfsmount *mnt) { return check_mnt(real_mount(mnt)); } - -static void *mntns_get(struct task_struct *task) -{ - struct mnt_namespace *ns = NULL; - struct nsproxy *nsproxy; - - rcu_read_lock(); - nsproxy = task_nsproxy(task); - if (nsproxy) { - ns = nsproxy->mnt_ns; - get_mnt_ns(ns); - } - rcu_read_unlock(); - - return ns; -} - -static void mntns_put(void *ns) -{ - put_mnt_ns(ns); -} - -static int mntns_install(struct nsproxy *nsproxy, void *ns) -{ - struct fs_struct *fs = current->fs; - struct mnt_namespace *mnt_ns = ns; - struct path root; - - if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) || - !nsown_capable(CAP_SYS_CHROOT)) - return -EPERM; - - if (fs->users != 1) - return -EINVAL; - - get_mnt_ns(mnt_ns); - put_mnt_ns(nsproxy->mnt_ns); - nsproxy->mnt_ns = mnt_ns; - - /* Find 
the root */ - root.mnt = &mnt_ns->root->mnt; - root.dentry = mnt_ns->root->mnt.mnt_root; - path_get(&root); - while(d_mountpoint(root.dentry) && follow_down_one(&root)) - ; - - /* Update the pwd and root */ - set_fs_pwd(fs, &root); - set_fs_root(fs, &root); - - path_put(&root); - return 0; -} - -static unsigned int mntns_inum(void *ns) -{ - struct mnt_namespace *mnt_ns = ns; - return mnt_ns->proc_inum; -} - -const struct proc_ns_operations mntns_operations = { - .name = "mnt", - .type = CLONE_NEWNS, - .get = mntns_get, - .put = mntns_put, - .install = mntns_install, - .inum = mntns_inum, -}; diff --git a/trunk/fs/open.c b/trunk/fs/open.c index 182d8667b7bd..59071f55bf7f 100644 --- a/trunk/fs/open.c +++ b/trunk/fs/open.c @@ -435,7 +435,7 @@ SYSCALL_DEFINE1(chroot, const char __user *, filename) goto dput_and_out; error = -EPERM; - if (!nsown_capable(CAP_SYS_CHROOT)) + if (!capable(CAP_SYS_CHROOT)) goto dput_and_out; error = security_path_chroot(&path); if (error) diff --git a/trunk/fs/pnode.h b/trunk/fs/pnode.h index 19b853a3445c..65c60979d541 100644 --- a/trunk/fs/pnode.h +++ b/trunk/fs/pnode.h @@ -22,7 +22,6 @@ #define CL_COPY_ALL 0x04 #define CL_MAKE_SHARED 0x08 #define CL_PRIVATE 0x10 -#define CL_SHARED_TO_SLAVE 0x20 static inline void set_mnt_shared(struct mount *mnt) { diff --git a/trunk/fs/proc/Makefile b/trunk/fs/proc/Makefile index 981b05601931..99349efbbc2b 100644 --- a/trunk/fs/proc/Makefile +++ b/trunk/fs/proc/Makefile @@ -21,7 +21,6 @@ proc-y += uptime.o proc-y += version.o proc-y += softirqs.o proc-y += namespaces.o -proc-y += self.o proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o proc-$(CONFIG_NET) += proc_net.o proc-$(CONFIG_PROC_KCORE) += kcore.o diff --git a/trunk/fs/proc/array.c b/trunk/fs/proc/array.c index d66248a1919b..d3696708fc1a 100644 --- a/trunk/fs/proc/array.c +++ b/trunk/fs/proc/array.c @@ -162,7 +162,7 @@ static inline const char *get_task_state(struct task_struct *tsk) static inline void task_state(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *p) { - struct user_namespace *user_ns = seq_user_ns(m); + struct user_namespace *user_ns = current_user_ns(); struct group_info *group_info; int g; struct fdtable *fdt = NULL; diff --git a/trunk/fs/proc/base.c b/trunk/fs/proc/base.c index 5a5a0be40e40..aa63d25157b8 100644 --- a/trunk/fs/proc/base.c +++ b/trunk/fs/proc/base.c @@ -2345,6 +2345,146 @@ static const struct file_operations proc_coredump_filter_operations = { }; #endif +/* + * /proc/self: + */ +static int proc_self_readlink(struct dentry *dentry, char __user *buffer, + int buflen) +{ + struct pid_namespace *ns = dentry->d_sb->s_fs_info; + pid_t tgid = task_tgid_nr_ns(current, ns); + char tmp[PROC_NUMBUF]; + if (!tgid) + return -ENOENT; + sprintf(tmp, "%d", tgid); + return vfs_readlink(dentry,buffer,buflen,tmp); +} + +static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct pid_namespace *ns = dentry->d_sb->s_fs_info; + pid_t tgid = task_tgid_nr_ns(current, ns); + char *name = ERR_PTR(-ENOENT); + if (tgid) { + /* 11 for max length of signed int in decimal + NULL term */ + name = kmalloc(12, GFP_KERNEL); + if (!name) + name = ERR_PTR(-ENOMEM); + else + sprintf(name, "%d", tgid); + } + nd_set_link(nd, name); + return NULL; +} + +static void proc_self_put_link(struct dentry *dentry, struct nameidata *nd, + void *cookie) +{ + char *s = nd_get_link(nd); + if (!IS_ERR(s)) + kfree(s); +} + +static const struct inode_operations proc_self_inode_operations = { + .readlink = proc_self_readlink, + 
.follow_link = proc_self_follow_link, + .put_link = proc_self_put_link, +}; + +/* + * proc base + * + * These are the directory entries in the root directory of /proc + * that properly belong to the /proc filesystem, as they describe + * describe something that is process related. + */ +static const struct pid_entry proc_base_stuff[] = { + NOD("self", S_IFLNK|S_IRWXUGO, + &proc_self_inode_operations, NULL, {}), +}; + +static struct dentry *proc_base_instantiate(struct inode *dir, + struct dentry *dentry, struct task_struct *task, const void *ptr) +{ + const struct pid_entry *p = ptr; + struct inode *inode; + struct proc_inode *ei; + struct dentry *error; + + /* Allocate the inode */ + error = ERR_PTR(-ENOMEM); + inode = new_inode(dir->i_sb); + if (!inode) + goto out; + + /* Initialize the inode */ + ei = PROC_I(inode); + inode->i_ino = get_next_ino(); + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + + /* + * grab the reference to the task. + */ + ei->pid = get_task_pid(task, PIDTYPE_PID); + if (!ei->pid) + goto out_iput; + + inode->i_mode = p->mode; + if (S_ISDIR(inode->i_mode)) + set_nlink(inode, 2); + if (S_ISLNK(inode->i_mode)) + inode->i_size = 64; + if (p->iop) + inode->i_op = p->iop; + if (p->fop) + inode->i_fop = p->fop; + ei->op = p->op; + d_add(dentry, inode); + error = NULL; +out: + return error; +out_iput: + iput(inode); + goto out; +} + +static struct dentry *proc_base_lookup(struct inode *dir, struct dentry *dentry) +{ + struct dentry *error; + struct task_struct *task = get_proc_task(dir); + const struct pid_entry *p, *last; + + error = ERR_PTR(-ENOENT); + + if (!task) + goto out_no_task; + + /* Lookup the directory entry */ + last = &proc_base_stuff[ARRAY_SIZE(proc_base_stuff) - 1]; + for (p = proc_base_stuff; p <= last; p++) { + if (p->len != dentry->d_name.len) + continue; + if (!memcmp(dentry->d_name.name, p->name, p->len)) + break; + } + if (p > last) + goto out; + + error = proc_base_instantiate(dir, dentry, task, p); + +out: + put_task_struct(task); +out_no_task: + return error; +} + +static int proc_base_fill_cache(struct file *filp, void *dirent, + filldir_t filldir, struct task_struct *task, const struct pid_entry *p) +{ + return proc_fill_cache(filp, dirent, filldir, p->name, p->len, + proc_base_instantiate, task, p); +} + #ifdef CONFIG_TASK_IO_ACCOUNTING static int do_io_accounting(struct task_struct *task, char *buffer, int whole) { @@ -2699,6 +2839,10 @@ void proc_flush_task(struct task_struct *task) proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr, tgid->numbers[i].nr); } + + upid = &pid->numbers[pid->level]; + if (upid->nr == 1) + pid_ns_release_proc(upid->ns); } static struct dentry *proc_pid_instantiate(struct inode *dir, @@ -2732,11 +2876,15 @@ static struct dentry *proc_pid_instantiate(struct inode *dir, struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) { - struct dentry *result = NULL; + struct dentry *result; struct task_struct *task; unsigned tgid; struct pid_namespace *ns; + result = proc_base_lookup(dir, dentry); + if (!IS_ERR(result) || PTR_ERR(result) != -ENOENT) + goto out; + tgid = name_to_int(dentry); if (tgid == ~0U) goto out; @@ -2799,7 +2947,7 @@ static struct tgid_iter next_tgid(struct pid_namespace *ns, struct tgid_iter ite return iter; } -#define TGID_OFFSET (FIRST_PROCESS_ENTRY) +#define TGID_OFFSET (FIRST_PROCESS_ENTRY + ARRAY_SIZE(proc_base_stuff)) static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldir, struct tgid_iter iter) @@ -2819,12 +2967,25 
@@ static int fake_filldir(void *buf, const char *name, int namelen, /* for the /proc/ directory itself, after non-process stuff has been done */ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) { + unsigned int nr; + struct task_struct *reaper; struct tgid_iter iter; struct pid_namespace *ns; filldir_t __filldir; if (filp->f_pos >= PID_MAX_LIMIT + TGID_OFFSET) - goto out; + goto out_no_task; + nr = filp->f_pos - FIRST_PROCESS_ENTRY; + + reaper = get_proc_task(filp->f_path.dentry->d_inode); + if (!reaper) + goto out_no_task; + + for (; nr < ARRAY_SIZE(proc_base_stuff); filp->f_pos++, nr++) { + const struct pid_entry *p = &proc_base_stuff[nr]; + if (proc_base_fill_cache(filp, dirent, filldir, reaper, p) < 0) + goto out; + } ns = filp->f_dentry->d_sb->s_fs_info; iter.task = NULL; @@ -2845,6 +3006,8 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) } filp->f_pos = PID_MAX_LIMIT + TGID_OFFSET; out: + put_task_struct(reaper); +out_no_task: return 0; } diff --git a/trunk/fs/proc/generic.c b/trunk/fs/proc/generic.c index 7b3ae3cc0ef9..0d80cef4cfb9 100644 --- a/trunk/fs/proc/generic.c +++ b/trunk/fs/proc/generic.c @@ -350,14 +350,14 @@ static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */ * Return an inode number between PROC_DYNAMIC_FIRST and * 0xffffffff, or zero on failure. */ -int proc_alloc_inum(unsigned int *inum) +static unsigned int get_inode_number(void) { unsigned int i; int error; retry: - if (!ida_pre_get(&proc_inum_ida, GFP_KERNEL)) - return -ENOMEM; + if (ida_pre_get(&proc_inum_ida, GFP_KERNEL) == 0) + return 0; spin_lock(&proc_inum_lock); error = ida_get_new(&proc_inum_ida, &i); @@ -365,19 +365,18 @@ int proc_alloc_inum(unsigned int *inum) if (error == -EAGAIN) goto retry; else if (error) - return error; + return 0; if (i > UINT_MAX - PROC_DYNAMIC_FIRST) { spin_lock(&proc_inum_lock); ida_remove(&proc_inum_ida, i); spin_unlock(&proc_inum_lock); - return -ENOSPC; + return 0; } - *inum = PROC_DYNAMIC_FIRST + i; - return 0; + return PROC_DYNAMIC_FIRST + i; } -void proc_free_inum(unsigned int inum) +static void release_inode_number(unsigned int inum) { spin_lock(&proc_inum_lock); ida_remove(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST); @@ -555,12 +554,13 @@ static const struct inode_operations proc_dir_inode_operations = { static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp) { + unsigned int i; struct proc_dir_entry *tmp; - int ret; - ret = proc_alloc_inum(&dp->low_ino); - if (ret) - return ret; + i = get_inode_number(); + if (i == 0) + return -EAGAIN; + dp->low_ino = i; if (S_ISDIR(dp->mode)) { if (dp->proc_iops == NULL) { @@ -764,7 +764,7 @@ EXPORT_SYMBOL(proc_create_data); static void free_proc_entry(struct proc_dir_entry *de) { - proc_free_inum(de->low_ino); + release_inode_number(de->low_ino); if (S_ISLNK(de->mode)) kfree(de->data); diff --git a/trunk/fs/proc/inode.c b/trunk/fs/proc/inode.c index 439ae6886507..3b22bbdee9ec 100644 --- a/trunk/fs/proc/inode.c +++ b/trunk/fs/proc/inode.c @@ -31,7 +31,6 @@ static void proc_evict_inode(struct inode *inode) struct proc_dir_entry *de; struct ctl_table_header *head; const struct proc_ns_operations *ns_ops; - void *ns; truncate_inode_pages(&inode->i_data, 0); clear_inode(inode); @@ -50,9 +49,8 @@ static void proc_evict_inode(struct inode *inode) } /* Release any associated namespace */ ns_ops = PROC_I(inode)->ns_ops; - ns = PROC_I(inode)->ns; - if (ns_ops && ns) - ns_ops->put(ns); + if (ns_ops && ns_ops->put) + ns_ops->put(PROC_I(inode)->ns); } static 
struct kmem_cache * proc_inode_cachep; diff --git a/trunk/fs/proc/internal.h b/trunk/fs/proc/internal.h index 252544c05207..43973b084abf 100644 --- a/trunk/fs/proc/internal.h +++ b/trunk/fs/proc/internal.h @@ -15,7 +15,6 @@ struct ctl_table_header; struct mempolicy; extern struct proc_dir_entry proc_root; -extern void proc_self_init(void); #ifdef CONFIG_PROC_SYSCTL extern int proc_sys_init(void); extern void sysctl_head_put(struct ctl_table_header *head); diff --git a/trunk/fs/proc/namespaces.c b/trunk/fs/proc/namespaces.c index b7a47196c8c3..b178ed733c36 100644 --- a/trunk/fs/proc/namespaces.c +++ b/trunk/fs/proc/namespaces.c @@ -11,7 +11,6 @@ #include #include #include -#include #include "internal.h" @@ -25,168 +24,12 @@ static const struct proc_ns_operations *ns_entries[] = { #ifdef CONFIG_IPC_NS &ipcns_operations, #endif -#ifdef CONFIG_PID_NS - &pidns_operations, -#endif -#ifdef CONFIG_USER_NS - &userns_operations, -#endif - &mntns_operations, }; static const struct file_operations ns_file_operations = { .llseek = no_llseek, }; -static const struct inode_operations ns_inode_operations = { - .setattr = proc_setattr, -}; - -static int ns_delete_dentry(const struct dentry *dentry) -{ - /* Don't cache namespace inodes when not in use */ - return 1; -} - -static char *ns_dname(struct dentry *dentry, char *buffer, int buflen) -{ - struct inode *inode = dentry->d_inode; - const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops; - - return dynamic_dname(dentry, buffer, buflen, "%s:[%lu]", - ns_ops->name, inode->i_ino); -} - -const struct dentry_operations ns_dentry_operations = -{ - .d_delete = ns_delete_dentry, - .d_dname = ns_dname, -}; - -static struct dentry *proc_ns_get_dentry(struct super_block *sb, - struct task_struct *task, const struct proc_ns_operations *ns_ops) -{ - struct dentry *dentry, *result; - struct inode *inode; - struct proc_inode *ei; - struct qstr qname = { .name = "", }; - void *ns; - - ns = ns_ops->get(task); - if (!ns) - return ERR_PTR(-ENOENT); - - dentry = d_alloc_pseudo(sb, &qname); - if (!dentry) { - ns_ops->put(ns); - return ERR_PTR(-ENOMEM); - } - - inode = iget_locked(sb, ns_ops->inum(ns)); - if (!inode) { - dput(dentry); - ns_ops->put(ns); - return ERR_PTR(-ENOMEM); - } - - ei = PROC_I(inode); - if (inode->i_state & I_NEW) { - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; - inode->i_op = &ns_inode_operations; - inode->i_mode = S_IFREG | S_IRUGO; - inode->i_fop = &ns_file_operations; - ei->ns_ops = ns_ops; - ei->ns = ns; - unlock_new_inode(inode); - } else { - ns_ops->put(ns); - } - - d_set_d_op(dentry, &ns_dentry_operations); - result = d_instantiate_unique(dentry, inode); - if (result) { - dput(dentry); - dentry = result; - } - - return dentry; -} - -static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd) -{ - struct inode *inode = dentry->d_inode; - struct super_block *sb = inode->i_sb; - struct proc_inode *ei = PROC_I(inode); - struct task_struct *task; - struct dentry *ns_dentry; - void *error = ERR_PTR(-EACCES); - - task = get_proc_task(inode); - if (!task) - goto out; - - if (!ptrace_may_access(task, PTRACE_MODE_READ)) - goto out_put_task; - - ns_dentry = proc_ns_get_dentry(sb, task, ei->ns_ops); - if (IS_ERR(ns_dentry)) { - error = ERR_CAST(ns_dentry); - goto out_put_task; - } - - dput(nd->path.dentry); - nd->path.dentry = ns_dentry; - error = NULL; - -out_put_task: - put_task_struct(task); -out: - return error; -} - -static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int buflen) -{ - 
struct inode *inode = dentry->d_inode; - struct proc_inode *ei = PROC_I(inode); - const struct proc_ns_operations *ns_ops = ei->ns_ops; - struct task_struct *task; - void *ns; - char name[50]; - int len = -EACCES; - - task = get_proc_task(inode); - if (!task) - goto out; - - if (!ptrace_may_access(task, PTRACE_MODE_READ)) - goto out_put_task; - - len = -ENOENT; - ns = ns_ops->get(task); - if (!ns) - goto out_put_task; - - snprintf(name, sizeof(name), "%s:[%u]", ns_ops->name, ns_ops->inum(ns)); - len = strlen(name); - - if (len > buflen) - len = buflen; - if (copy_to_user(buffer, name, len)) - len = -EFAULT; - - ns_ops->put(ns); -out_put_task: - put_task_struct(task); -out: - return len; -} - -static const struct inode_operations proc_ns_link_inode_operations = { - .readlink = proc_ns_readlink, - .follow_link = proc_ns_follow_link, - .setattr = proc_setattr, -}; - static struct dentry *proc_ns_instantiate(struct inode *dir, struct dentry *dentry, struct task_struct *task, const void *ptr) { @@ -194,15 +37,21 @@ static struct dentry *proc_ns_instantiate(struct inode *dir, struct inode *inode; struct proc_inode *ei; struct dentry *error = ERR_PTR(-ENOENT); + void *ns; inode = proc_pid_make_inode(dir->i_sb, task); if (!inode) goto out; + ns = ns_ops->get(task); + if (!ns) + goto out_iput; + ei = PROC_I(inode); - inode->i_mode = S_IFLNK|S_IRWXUGO; - inode->i_op = &proc_ns_link_inode_operations; - ei->ns_ops = ns_ops; + inode->i_mode = S_IFREG|S_IRUSR; + inode->i_fop = &ns_file_operations; + ei->ns_ops = ns_ops; + ei->ns = ns; d_set_d_op(dentry, &pid_dentry_operations); d_add(dentry, inode); @@ -211,6 +60,9 @@ static struct dentry *proc_ns_instantiate(struct inode *dir, error = NULL; out: return error; +out_iput: + iput(inode); + goto out; } static int proc_ns_fill_cache(struct file *filp, void *dirent, @@ -237,6 +89,10 @@ static int proc_ns_dir_readdir(struct file *filp, void *dirent, if (!task) goto out_no_task; + ret = -EPERM; + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + goto out; + ret = 0; i = filp->f_pos; switch (i) { @@ -296,6 +152,10 @@ static struct dentry *proc_ns_dir_lookup(struct inode *dir, if (!task) goto out_no_task; + error = ERR_PTR(-EPERM); + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + goto out; + last = &ns_entries[ARRAY_SIZE(ns_entries)]; for (entry = ns_entries; entry < last; entry++) { if (strlen((*entry)->name) != len) @@ -303,6 +163,7 @@ static struct dentry *proc_ns_dir_lookup(struct inode *dir, if (!memcmp(dentry->d_name.name, (*entry)->name, len)) break; } + error = ERR_PTR(-ENOENT); if (entry == last) goto out; @@ -337,7 +198,3 @@ struct file *proc_ns_fget(int fd) return ERR_PTR(-EINVAL); } -bool proc_ns_inode(struct inode *inode) -{ - return inode->i_fop == &ns_file_operations; -} diff --git a/trunk/fs/proc/root.c b/trunk/fs/proc/root.c index c6e9fac26bac..9889a92d2e01 100644 --- a/trunk/fs/proc/root.c +++ b/trunk/fs/proc/root.c @@ -100,13 +100,14 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, int err; struct super_block *sb; struct pid_namespace *ns; + struct proc_inode *ei; char *options; if (flags & MS_KERNMOUNT) { ns = (struct pid_namespace *)data; options = NULL; } else { - ns = task_active_pid_ns(current); + ns = current->nsproxy->pid_ns; options = data; } @@ -129,6 +130,13 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, sb->s_flags |= MS_ACTIVE; } + ei = PROC_I(sb->s_root->d_inode); + if (!ei->pid) { + rcu_read_lock(); + ei->pid = get_pid(find_pid_ns(1, ns)); + rcu_read_unlock(); + } + return 
dget(sb->s_root); } @@ -145,7 +153,6 @@ static struct file_system_type proc_fs_type = { .name = "proc", .mount = proc_mount, .kill_sb = proc_kill_sb, - .fs_flags = FS_USERNS_MOUNT, }; void __init proc_root_init(void) @@ -156,8 +163,12 @@ void __init proc_root_init(void) err = register_filesystem(&proc_fs_type); if (err) return; + err = pid_ns_prepare_proc(&init_pid_ns); + if (err) { + unregister_filesystem(&proc_fs_type); + return; + } - proc_self_init(); proc_symlink("mounts", NULL, "self/mounts"); proc_net_init(); diff --git a/trunk/fs/proc/self.c b/trunk/fs/proc/self.c deleted file mode 100644 index aa5cc3bff140..000000000000 --- a/trunk/fs/proc/self.c +++ /dev/null @@ -1,59 +0,0 @@ -#include -#include -#include - -/* - * /proc/self: - */ -static int proc_self_readlink(struct dentry *dentry, char __user *buffer, - int buflen) -{ - struct pid_namespace *ns = dentry->d_sb->s_fs_info; - pid_t tgid = task_tgid_nr_ns(current, ns); - char tmp[PROC_NUMBUF]; - if (!tgid) - return -ENOENT; - sprintf(tmp, "%d", tgid); - return vfs_readlink(dentry,buffer,buflen,tmp); -} - -static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) -{ - struct pid_namespace *ns = dentry->d_sb->s_fs_info; - pid_t tgid = task_tgid_nr_ns(current, ns); - char *name = ERR_PTR(-ENOENT); - if (tgid) { - /* 11 for max length of signed int in decimal + NULL term */ - name = kmalloc(12, GFP_KERNEL); - if (!name) - name = ERR_PTR(-ENOMEM); - else - sprintf(name, "%d", tgid); - } - nd_set_link(nd, name); - return NULL; -} - -static void proc_self_put_link(struct dentry *dentry, struct nameidata *nd, - void *cookie) -{ - char *s = nd_get_link(nd); - if (!IS_ERR(s)) - kfree(s); -} - -static const struct inode_operations proc_self_inode_operations = { - .readlink = proc_self_readlink, - .follow_link = proc_self_follow_link, - .put_link = proc_self_put_link, -}; - -void __init proc_self_init(void) -{ - struct proc_dir_entry *proc_self_symlink; - mode_t mode; - - mode = S_IFLNK | S_IRWXUGO; - proc_self_symlink = proc_create("self", mode, NULL, NULL ); - proc_self_symlink->proc_iops = &proc_self_inode_operations; -} diff --git a/trunk/fs/sysfs/mount.c b/trunk/fs/sysfs/mount.c index db940a9be045..71eb7e253927 100644 --- a/trunk/fs/sysfs/mount.c +++ b/trunk/fs/sysfs/mount.c @@ -149,7 +149,6 @@ static struct file_system_type sysfs_fs_type = { .name = "sysfs", .mount = sysfs_mount, .kill_sb = sysfs_kill_sb, - .fs_flags = FS_USERNS_MOUNT, }; int __init sysfs_init(void) diff --git a/trunk/include/linux/cred.h b/trunk/include/linux/cred.h index abb2cd50f6b2..0142aacb70b7 100644 --- a/trunk/include/linux/cred.h +++ b/trunk/include/linux/cred.h @@ -344,8 +344,10 @@ static inline void put_cred(const struct cred *_cred) extern struct user_namespace init_user_ns; #ifdef CONFIG_USER_NS #define current_user_ns() (current_cred_xxx(user_ns)) +#define task_user_ns(task) (task_cred_xxx((task), user_ns)) #else #define current_user_ns() (&init_user_ns) +#define task_user_ns(task) (&init_user_ns) #endif diff --git a/trunk/include/linux/drbd.h b/trunk/include/linux/drbd.h index 0c5a18ec322c..47e3d4850584 100644 --- a/trunk/include/linux/drbd.h +++ b/trunk/include/linux/drbd.h @@ -51,11 +51,12 @@ #endif + extern const char *drbd_buildtag(void); -#define REL_VERSION "8.4.2" -#define API_VERSION 1 +#define REL_VERSION "8.3.13" +#define API_VERSION 88 #define PRO_VERSION_MIN 86 -#define PRO_VERSION_MAX 101 +#define PRO_VERSION_MAX 96 enum drbd_io_error_p { @@ -65,8 +66,7 @@ enum drbd_io_error_p { }; enum drbd_fencing_p { - FP_NOT_AVAIL = 
-1, /* Not a policy */ - FP_DONT_CARE = 0, + FP_DONT_CARE, FP_RESOURCE, FP_STONITH }; @@ -102,20 +102,6 @@ enum drbd_on_congestion { OC_DISCONNECT, }; -enum drbd_read_balancing { - RB_PREFER_LOCAL, - RB_PREFER_REMOTE, - RB_ROUND_ROBIN, - RB_LEAST_PENDING, - RB_CONGESTED_REMOTE, - RB_32K_STRIPING, - RB_64K_STRIPING, - RB_128K_STRIPING, - RB_256K_STRIPING, - RB_512K_STRIPING, - RB_1M_STRIPING, -}; - /* KEEP the order, do not delete or insert. Only append. */ enum drbd_ret_code { ERR_CODE_BASE = 100, @@ -136,7 +122,7 @@ enum drbd_ret_code { ERR_AUTH_ALG = 120, ERR_AUTH_ALG_ND = 121, ERR_NOMEM = 122, - ERR_DISCARD_IMPOSSIBLE = 123, + ERR_DISCARD = 123, ERR_DISK_CONFIGURED = 124, ERR_NET_CONFIGURED = 125, ERR_MANDATORY_TAG = 126, @@ -144,8 +130,8 @@ enum drbd_ret_code { ERR_INTR = 129, /* EINTR */ ERR_RESIZE_RESYNC = 130, ERR_NO_PRIMARY = 131, - ERR_RESYNC_AFTER = 132, - ERR_RESYNC_AFTER_CYCLE = 133, + ERR_SYNC_AFTER = 132, + ERR_SYNC_AFTER_CYCLE = 133, ERR_PAUSE_IS_SET = 134, ERR_PAUSE_IS_CLEAR = 135, ERR_PACKET_NR = 137, @@ -169,14 +155,6 @@ enum drbd_ret_code { ERR_CONG_NOT_PROTO_A = 155, ERR_PIC_AFTER_DEP = 156, ERR_PIC_PEER_DEP = 157, - ERR_RES_NOT_KNOWN = 158, - ERR_RES_IN_USE = 159, - ERR_MINOR_CONFIGURED = 160, - ERR_MINOR_EXISTS = 161, - ERR_INVALID_REQUEST = 162, - ERR_NEED_APV_100 = 163, - ERR_NEED_ALLOW_TWO_PRI = 164, - ERR_MD_UNCLEAN = 165, /* insert new ones above this line */ AFTER_LAST_ERR_CODE @@ -318,8 +296,7 @@ enum drbd_state_rv { SS_NOT_SUPPORTED = -17, /* drbd-8.2 only */ SS_IN_TRANSIENT_STATE = -18, /* Retry after the next state change */ SS_CONCURRENT_ST_CHG = -19, /* Concurrent cluster side state change! */ - SS_O_VOL_PEER_PRI = -20, - SS_AFTER_LAST_ERROR = -21, /* Keep this at bottom */ + SS_AFTER_LAST_ERROR = -20, /* Keep this at bottom */ }; /* from drbd_strings.c */ @@ -336,9 +313,7 @@ extern const char *drbd_set_st_err_str(enum drbd_state_rv); #define MDF_FULL_SYNC (1 << 3) #define MDF_WAS_UP_TO_DATE (1 << 4) #define MDF_PEER_OUT_DATED (1 << 5) -#define MDF_CRASHED_PRIMARY (1 << 6) -#define MDF_AL_CLEAN (1 << 7) -#define MDF_AL_DISABLED (1 << 8) +#define MDF_CRASHED_PRIMARY (1 << 6) enum drbd_uuid_index { UI_CURRENT, @@ -358,23 +333,37 @@ enum drbd_timeout_flag { #define UUID_JUST_CREATED ((__u64)4) -/* magic numbers used in meta data and network packets */ #define DRBD_MAGIC 0x83740267 +#define BE_DRBD_MAGIC __constant_cpu_to_be32(DRBD_MAGIC) #define DRBD_MAGIC_BIG 0x835a -#define DRBD_MAGIC_100 0x8620ec20 - -#define DRBD_MD_MAGIC_07 (DRBD_MAGIC+3) -#define DRBD_MD_MAGIC_08 (DRBD_MAGIC+4) -#define DRBD_MD_MAGIC_84_UNCLEAN (DRBD_MAGIC+5) - - -/* how I came up with this magic? 
- * base64 decode "actlog==" ;) */ -#define DRBD_AL_MAGIC 0x69cb65a2 +#define BE_DRBD_MAGIC_BIG __constant_cpu_to_be16(DRBD_MAGIC_BIG) /* these are of type "int" */ #define DRBD_MD_INDEX_INTERNAL -1 #define DRBD_MD_INDEX_FLEX_EXT -2 #define DRBD_MD_INDEX_FLEX_INT -3 +/* Start of the new netlink/connector stuff */ + +#define DRBD_NL_CREATE_DEVICE 0x01 +#define DRBD_NL_SET_DEFAULTS 0x02 + + +/* For searching a vacant cn_idx value */ +#define CN_IDX_STEP 6977 + +struct drbd_nl_cfg_req { + int packet_type; + unsigned int drbd_minor; + int flags; + unsigned short tag_list[]; +}; + +struct drbd_nl_cfg_reply { + int packet_type; + unsigned int minor; + int ret_code; /* enum ret_code or set_st_err_t */ + unsigned short tag_list[]; /* only used with get_* calls */ +}; + #endif diff --git a/trunk/include/linux/drbd_genl.h b/trunk/include/linux/drbd_genl.h deleted file mode 100644 index d0d8fac8a6e4..000000000000 --- a/trunk/include/linux/drbd_genl.h +++ /dev/null @@ -1,378 +0,0 @@ -/* - * General overview: - * full generic netlink message: - * |nlmsghdr|genlmsghdr| - * - * payload: - * |optional fixed size family header| - * - * sequence of netlink attributes: - * I chose to have all "top level" attributes NLA_NESTED, - * corresponding to some real struct. - * So we have a sequence of |tla, len| - * - * nested nla sequence: - * may be empty, or contain a sequence of netlink attributes - * representing the struct fields. - * - * The tag number of any field (regardless of containing struct) - * will be available as T_ ## field_name, - * so you cannot have the same field name in two differnt structs. - * - * The tag numbers themselves are per struct, though, - * so should always begin at 1 (not 0, that is the special "NLA_UNSPEC" type, - * which we won't use here). - * The tag numbers are used as index in the respective nla_policy array. - * - * GENL_struct(tag_name, tag_number, struct name, struct fields) - struct and policy - * genl_magic_struct.h - * generates the struct declaration, - * generates an entry in the tla enum, - * genl_magic_func.h - * generates an entry in the static tla policy - * with .type = NLA_NESTED - * generates the static _nl_policy definition, - * and static conversion functions - * - * genl_magic_func.h - * - * GENL_mc_group(group) - * genl_magic_struct.h - * does nothing - * genl_magic_func.h - * defines and registers the mcast group, - * and provides a send helper - * - * GENL_notification(op_name, op_num, mcast_group, tla list) - * These are notifications to userspace. - * - * genl_magic_struct.h - * generates an entry in the genl_ops enum, - * genl_magic_func.h - * does nothing - * - * mcast group: the name of the mcast group this notification should be - * expected on - * tla list: the list of expected top level attributes, - * for documentation and sanity checking. - * - * GENL_op(op_name, op_num, flags and handler, tla list) - "genl operations" - * These are requests from userspace. - * - * _op and _notification share the same "number space", - * op_nr will be assigned to "genlmsghdr->cmd" - * - * genl_magic_struct.h - * generates an entry in the genl_ops enum, - * genl_magic_func.h - * generates an entry in the static genl_ops array, - * and static register/unregister functions to - * genl_register_family_with_ops(). - * - * flags and handler: - * GENL_op_init( .doit = x, .dumpit = y, .flags = something) - * GENL_doit(x) => .dumpit = NULL, .flags = GENL_ADMIN_PERM - * tla list: the list of expected top level attributes, - * for documentation and sanity checking. 
- */ - -/* - * STRUCTS - */ - -/* this is sent kernel -> userland on various error conditions, and contains - * informational textual info, which is supposedly human readable. - * The computer relevant return code is in the drbd_genlmsghdr. - */ -GENL_struct(DRBD_NLA_CFG_REPLY, 1, drbd_cfg_reply, - /* "arbitrary" size strings, nla_policy.len = 0 */ - __str_field(1, DRBD_GENLA_F_MANDATORY, info_text, 0) -) - -/* Configuration requests typically need a context to operate on. - * Possible keys are device minor (fits in the drbd_genlmsghdr), - * the replication link (aka connection) name, - * and/or the replication group (aka resource) name, - * and the volume id within the resource. */ -GENL_struct(DRBD_NLA_CFG_CONTEXT, 2, drbd_cfg_context, - __u32_field(1, DRBD_GENLA_F_MANDATORY, ctx_volume) - __str_field(2, DRBD_GENLA_F_MANDATORY, ctx_resource_name, 128) - __bin_field(3, DRBD_GENLA_F_MANDATORY, ctx_my_addr, 128) - __bin_field(4, DRBD_GENLA_F_MANDATORY, ctx_peer_addr, 128) -) - -GENL_struct(DRBD_NLA_DISK_CONF, 3, disk_conf, - __str_field(1, DRBD_F_REQUIRED | DRBD_F_INVARIANT, backing_dev, 128) - __str_field(2, DRBD_F_REQUIRED | DRBD_F_INVARIANT, meta_dev, 128) - __s32_field(3, DRBD_F_REQUIRED | DRBD_F_INVARIANT, meta_dev_idx) - - /* use the resize command to try and change the disk_size */ - __u64_field(4, DRBD_GENLA_F_MANDATORY | DRBD_F_INVARIANT, disk_size) - /* we could change the max_bio_bvecs, - * but it won't propagate through the stack */ - __u32_field(5, DRBD_GENLA_F_MANDATORY | DRBD_F_INVARIANT, max_bio_bvecs) - - __u32_field_def(6, DRBD_GENLA_F_MANDATORY, on_io_error, DRBD_ON_IO_ERROR_DEF) - __u32_field_def(7, DRBD_GENLA_F_MANDATORY, fencing, DRBD_FENCING_DEF) - - __u32_field_def(8, DRBD_GENLA_F_MANDATORY, resync_rate, DRBD_RESYNC_RATE_DEF) - __s32_field_def(9, DRBD_GENLA_F_MANDATORY, resync_after, DRBD_MINOR_NUMBER_DEF) - __u32_field_def(10, DRBD_GENLA_F_MANDATORY, al_extents, DRBD_AL_EXTENTS_DEF) - __u32_field_def(11, DRBD_GENLA_F_MANDATORY, c_plan_ahead, DRBD_C_PLAN_AHEAD_DEF) - __u32_field_def(12, DRBD_GENLA_F_MANDATORY, c_delay_target, DRBD_C_DELAY_TARGET_DEF) - __u32_field_def(13, DRBD_GENLA_F_MANDATORY, c_fill_target, DRBD_C_FILL_TARGET_DEF) - __u32_field_def(14, DRBD_GENLA_F_MANDATORY, c_max_rate, DRBD_C_MAX_RATE_DEF) - __u32_field_def(15, DRBD_GENLA_F_MANDATORY, c_min_rate, DRBD_C_MIN_RATE_DEF) - - __flg_field_def(16, DRBD_GENLA_F_MANDATORY, disk_barrier, DRBD_DISK_BARRIER_DEF) - __flg_field_def(17, DRBD_GENLA_F_MANDATORY, disk_flushes, DRBD_DISK_FLUSHES_DEF) - __flg_field_def(18, DRBD_GENLA_F_MANDATORY, disk_drain, DRBD_DISK_DRAIN_DEF) - __flg_field_def(19, DRBD_GENLA_F_MANDATORY, md_flushes, DRBD_MD_FLUSHES_DEF) - __u32_field_def(20, DRBD_GENLA_F_MANDATORY, disk_timeout, DRBD_DISK_TIMEOUT_DEF) - __u32_field_def(21, 0 /* OPTIONAL */, read_balancing, DRBD_READ_BALANCING_DEF) - /* 9: __u32_field_def(22, DRBD_GENLA_F_MANDATORY, unplug_watermark, DRBD_UNPLUG_WATERMARK_DEF) */ - __flg_field_def(23, 0 /* OPTIONAL */, al_updates, DRBD_AL_UPDATES_DEF) -) - -GENL_struct(DRBD_NLA_RESOURCE_OPTS, 4, res_opts, - __str_field_def(1, DRBD_GENLA_F_MANDATORY, cpu_mask, 32) - __u32_field_def(2, DRBD_GENLA_F_MANDATORY, on_no_data, DRBD_ON_NO_DATA_DEF) -) - -GENL_struct(DRBD_NLA_NET_CONF, 5, net_conf, - __str_field_def(1, DRBD_GENLA_F_MANDATORY | DRBD_F_SENSITIVE, - shared_secret, SHARED_SECRET_MAX) - __str_field_def(2, DRBD_GENLA_F_MANDATORY, cram_hmac_alg, SHARED_SECRET_MAX) - __str_field_def(3, DRBD_GENLA_F_MANDATORY, integrity_alg, SHARED_SECRET_MAX) - __str_field_def(4, 
DRBD_GENLA_F_MANDATORY, verify_alg, SHARED_SECRET_MAX) - __str_field_def(5, DRBD_GENLA_F_MANDATORY, csums_alg, SHARED_SECRET_MAX) - __u32_field_def(6, DRBD_GENLA_F_MANDATORY, wire_protocol, DRBD_PROTOCOL_DEF) - __u32_field_def(7, DRBD_GENLA_F_MANDATORY, connect_int, DRBD_CONNECT_INT_DEF) - __u32_field_def(8, DRBD_GENLA_F_MANDATORY, timeout, DRBD_TIMEOUT_DEF) - __u32_field_def(9, DRBD_GENLA_F_MANDATORY, ping_int, DRBD_PING_INT_DEF) - __u32_field_def(10, DRBD_GENLA_F_MANDATORY, ping_timeo, DRBD_PING_TIMEO_DEF) - __u32_field_def(11, DRBD_GENLA_F_MANDATORY, sndbuf_size, DRBD_SNDBUF_SIZE_DEF) - __u32_field_def(12, DRBD_GENLA_F_MANDATORY, rcvbuf_size, DRBD_RCVBUF_SIZE_DEF) - __u32_field_def(13, DRBD_GENLA_F_MANDATORY, ko_count, DRBD_KO_COUNT_DEF) - __u32_field_def(14, DRBD_GENLA_F_MANDATORY, max_buffers, DRBD_MAX_BUFFERS_DEF) - __u32_field_def(15, DRBD_GENLA_F_MANDATORY, max_epoch_size, DRBD_MAX_EPOCH_SIZE_DEF) - __u32_field_def(16, DRBD_GENLA_F_MANDATORY, unplug_watermark, DRBD_UNPLUG_WATERMARK_DEF) - __u32_field_def(17, DRBD_GENLA_F_MANDATORY, after_sb_0p, DRBD_AFTER_SB_0P_DEF) - __u32_field_def(18, DRBD_GENLA_F_MANDATORY, after_sb_1p, DRBD_AFTER_SB_1P_DEF) - __u32_field_def(19, DRBD_GENLA_F_MANDATORY, after_sb_2p, DRBD_AFTER_SB_2P_DEF) - __u32_field_def(20, DRBD_GENLA_F_MANDATORY, rr_conflict, DRBD_RR_CONFLICT_DEF) - __u32_field_def(21, DRBD_GENLA_F_MANDATORY, on_congestion, DRBD_ON_CONGESTION_DEF) - __u32_field_def(22, DRBD_GENLA_F_MANDATORY, cong_fill, DRBD_CONG_FILL_DEF) - __u32_field_def(23, DRBD_GENLA_F_MANDATORY, cong_extents, DRBD_CONG_EXTENTS_DEF) - __flg_field_def(24, DRBD_GENLA_F_MANDATORY, two_primaries, DRBD_ALLOW_TWO_PRIMARIES_DEF) - __flg_field(25, DRBD_GENLA_F_MANDATORY | DRBD_F_INVARIANT, discard_my_data) - __flg_field_def(26, DRBD_GENLA_F_MANDATORY, tcp_cork, DRBD_TCP_CORK_DEF) - __flg_field_def(27, DRBD_GENLA_F_MANDATORY, always_asbp, DRBD_ALWAYS_ASBP_DEF) - __flg_field(28, DRBD_GENLA_F_MANDATORY | DRBD_F_INVARIANT, tentative) - __flg_field_def(29, DRBD_GENLA_F_MANDATORY, use_rle, DRBD_USE_RLE_DEF) - /* 9: __u32_field_def(30, DRBD_GENLA_F_MANDATORY, fencing_policy, DRBD_FENCING_DEF) */ -) - -GENL_struct(DRBD_NLA_SET_ROLE_PARMS, 6, set_role_parms, - __flg_field(1, DRBD_GENLA_F_MANDATORY, assume_uptodate) -) - -GENL_struct(DRBD_NLA_RESIZE_PARMS, 7, resize_parms, - __u64_field(1, DRBD_GENLA_F_MANDATORY, resize_size) - __flg_field(2, DRBD_GENLA_F_MANDATORY, resize_force) - __flg_field(3, DRBD_GENLA_F_MANDATORY, no_resync) -) - -GENL_struct(DRBD_NLA_STATE_INFO, 8, state_info, - /* the reason of the broadcast, - * if this is an event triggered broadcast. */ - __u32_field(1, DRBD_GENLA_F_MANDATORY, sib_reason) - __u32_field(2, DRBD_F_REQUIRED, current_state) - __u64_field(3, DRBD_GENLA_F_MANDATORY, capacity) - __u64_field(4, DRBD_GENLA_F_MANDATORY, ed_uuid) - - /* These are for broadcast from after state change work. - * prev_state and new_state are from the moment the state change took - * place, new_state is not neccessarily the same as current_state, - * there may have been more state changes since. Which will be - * broadcasted soon, in their respective after state change work. 
*/ - __u32_field(5, DRBD_GENLA_F_MANDATORY, prev_state) - __u32_field(6, DRBD_GENLA_F_MANDATORY, new_state) - - /* if we have a local disk: */ - __bin_field(7, DRBD_GENLA_F_MANDATORY, uuids, (UI_SIZE*sizeof(__u64))) - __u32_field(8, DRBD_GENLA_F_MANDATORY, disk_flags) - __u64_field(9, DRBD_GENLA_F_MANDATORY, bits_total) - __u64_field(10, DRBD_GENLA_F_MANDATORY, bits_oos) - /* and in case resync or online verify is active */ - __u64_field(11, DRBD_GENLA_F_MANDATORY, bits_rs_total) - __u64_field(12, DRBD_GENLA_F_MANDATORY, bits_rs_failed) - - /* for pre and post notifications of helper execution */ - __str_field(13, DRBD_GENLA_F_MANDATORY, helper, 32) - __u32_field(14, DRBD_GENLA_F_MANDATORY, helper_exit_code) - - __u64_field(15, 0, send_cnt) - __u64_field(16, 0, recv_cnt) - __u64_field(17, 0, read_cnt) - __u64_field(18, 0, writ_cnt) - __u64_field(19, 0, al_writ_cnt) - __u64_field(20, 0, bm_writ_cnt) - __u32_field(21, 0, ap_bio_cnt) - __u32_field(22, 0, ap_pending_cnt) - __u32_field(23, 0, rs_pending_cnt) -) - -GENL_struct(DRBD_NLA_START_OV_PARMS, 9, start_ov_parms, - __u64_field(1, DRBD_GENLA_F_MANDATORY, ov_start_sector) - __u64_field(2, DRBD_GENLA_F_MANDATORY, ov_stop_sector) -) - -GENL_struct(DRBD_NLA_NEW_C_UUID_PARMS, 10, new_c_uuid_parms, - __flg_field(1, DRBD_GENLA_F_MANDATORY, clear_bm) -) - -GENL_struct(DRBD_NLA_TIMEOUT_PARMS, 11, timeout_parms, - __u32_field(1, DRBD_F_REQUIRED, timeout_type) -) - -GENL_struct(DRBD_NLA_DISCONNECT_PARMS, 12, disconnect_parms, - __flg_field(1, DRBD_GENLA_F_MANDATORY, force_disconnect) -) - -GENL_struct(DRBD_NLA_DETACH_PARMS, 13, detach_parms, - __flg_field(1, DRBD_GENLA_F_MANDATORY, force_detach) -) - -/* - * Notifications and commands (genlmsghdr->cmd) - */ -GENL_mc_group(events) - - /* kernel -> userspace announcement of changes */ -GENL_notification( - DRBD_EVENT, 1, events, - GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) - GENL_tla_expected(DRBD_NLA_STATE_INFO, DRBD_F_REQUIRED) - GENL_tla_expected(DRBD_NLA_NET_CONF, DRBD_GENLA_F_MANDATORY) - GENL_tla_expected(DRBD_NLA_DISK_CONF, DRBD_GENLA_F_MANDATORY) - GENL_tla_expected(DRBD_NLA_SYNCER_CONF, DRBD_GENLA_F_MANDATORY) -) - - /* query kernel for specific or all info */ -GENL_op( - DRBD_ADM_GET_STATUS, 2, - GENL_op_init( - .doit = drbd_adm_get_status, - .dumpit = drbd_adm_get_status_all, - /* anyone may ask for the status, - * it is broadcasted anyways */ - ), - /* To select the object .doit. - * Or a subset of objects in .dumpit. 
*/ - GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY) -) - - /* add DRBD minor devices as volumes to resources */ -GENL_op(DRBD_ADM_NEW_MINOR, 5, GENL_doit(drbd_adm_add_minor), - GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) -GENL_op(DRBD_ADM_DEL_MINOR, 6, GENL_doit(drbd_adm_delete_minor), - GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) - - /* add or delete resources */ -GENL_op(DRBD_ADM_NEW_RESOURCE, 7, GENL_doit(drbd_adm_new_resource), - GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) -GENL_op(DRBD_ADM_DEL_RESOURCE, 8, GENL_doit(drbd_adm_del_resource), - GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) - -GENL_op(DRBD_ADM_RESOURCE_OPTS, 9, - GENL_doit(drbd_adm_resource_opts), - GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) - GENL_tla_expected(DRBD_NLA_RESOURCE_OPTS, DRBD_GENLA_F_MANDATORY) -) - -GENL_op( - DRBD_ADM_CONNECT, 10, - GENL_doit(drbd_adm_connect), - GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) - GENL_tla_expected(DRBD_NLA_NET_CONF, DRBD_F_REQUIRED) -) - -GENL_op( - DRBD_ADM_CHG_NET_OPTS, 29, - GENL_doit(drbd_adm_net_opts), - GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) - GENL_tla_expected(DRBD_NLA_NET_CONF, DRBD_F_REQUIRED) -) - -GENL_op(DRBD_ADM_DISCONNECT, 11, GENL_doit(drbd_adm_disconnect), - GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) - -GENL_op(DRBD_ADM_ATTACH, 12, - GENL_doit(drbd_adm_attach), - GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) - GENL_tla_expected(DRBD_NLA_DISK_CONF, DRBD_F_REQUIRED) -) - -GENL_op(DRBD_ADM_CHG_DISK_OPTS, 28, - GENL_doit(drbd_adm_disk_opts), - GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) - GENL_tla_expected(DRBD_NLA_DISK_OPTS, DRBD_F_REQUIRED) -) - -GENL_op( - DRBD_ADM_RESIZE, 13, - GENL_doit(drbd_adm_resize), - GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) - GENL_tla_expected(DRBD_NLA_RESIZE_PARMS, DRBD_GENLA_F_MANDATORY) -) - -GENL_op( - DRBD_ADM_PRIMARY, 14, - GENL_doit(drbd_adm_set_role), - GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) - GENL_tla_expected(DRBD_NLA_SET_ROLE_PARMS, DRBD_F_REQUIRED) -) - -GENL_op( - DRBD_ADM_SECONDARY, 15, - GENL_doit(drbd_adm_set_role), - GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) - GENL_tla_expected(DRBD_NLA_SET_ROLE_PARMS, DRBD_F_REQUIRED) -) - -GENL_op( - DRBD_ADM_NEW_C_UUID, 16, - GENL_doit(drbd_adm_new_c_uuid), - GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) - GENL_tla_expected(DRBD_NLA_NEW_C_UUID_PARMS, DRBD_GENLA_F_MANDATORY) -) - -GENL_op( - DRBD_ADM_START_OV, 17, - GENL_doit(drbd_adm_start_ov), - GENL_tla_expected(DRBD_NLA_START_OV_PARMS, DRBD_GENLA_F_MANDATORY) -) - -GENL_op(DRBD_ADM_DETACH, 18, GENL_doit(drbd_adm_detach), - GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) - GENL_tla_expected(DRBD_NLA_DETACH_PARMS, DRBD_GENLA_F_MANDATORY)) - -GENL_op(DRBD_ADM_INVALIDATE, 19, GENL_doit(drbd_adm_invalidate), - GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) -GENL_op(DRBD_ADM_INVAL_PEER, 20, GENL_doit(drbd_adm_invalidate_peer), - GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) -GENL_op(DRBD_ADM_PAUSE_SYNC, 21, GENL_doit(drbd_adm_pause_sync), - GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) -GENL_op(DRBD_ADM_RESUME_SYNC, 22, GENL_doit(drbd_adm_resume_sync), - GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) -GENL_op(DRBD_ADM_SUSPEND_IO, 23, GENL_doit(drbd_adm_suspend_io), - GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) -GENL_op(DRBD_ADM_RESUME_IO, 24, 
GENL_doit(drbd_adm_resume_io), - GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) -GENL_op(DRBD_ADM_OUTDATE, 25, GENL_doit(drbd_adm_outdate), - GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) -GENL_op(DRBD_ADM_GET_TIMEOUT_TYPE, 26, GENL_doit(drbd_adm_get_timeout_type), - GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) -GENL_op(DRBD_ADM_DOWN, 27, GENL_doit(drbd_adm_down), - GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) diff --git a/trunk/include/linux/drbd_genl_api.h b/trunk/include/linux/drbd_genl_api.h deleted file mode 100644 index 9ef50d51e34e..000000000000 --- a/trunk/include/linux/drbd_genl_api.h +++ /dev/null @@ -1,55 +0,0 @@ -#ifndef DRBD_GENL_STRUCT_H -#define DRBD_GENL_STRUCT_H - -/** - * struct drbd_genlmsghdr - DRBD specific header used in NETLINK_GENERIC requests - * @minor: - * For admin requests (user -> kernel): which minor device to operate on. - * For (unicast) replies or informational (broadcast) messages - * (kernel -> user): which minor device the information is about. - * If we do not operate on minors, but on connections or resources, - * the minor value shall be (~0), and the attribute DRBD_NLA_CFG_CONTEXT - * is used instead. - * @flags: possible operation modifiers (relevant only for user->kernel): - * DRBD_GENL_F_SET_DEFAULTS - * @volume: - * When creating a new minor (adding it to a resource), the resource needs - * to know which volume number within the resource this is supposed to be. - * The volume number corresponds to the same volume number on the remote side, - * whereas the minor number on the remote side may be different - * (union with flags). - * @ret_code: kernel->userland unicast cfg reply return code (union with flags); - */ -struct drbd_genlmsghdr { - __u32 minor; - union { - __u32 flags; - __s32 ret_code; - }; -}; - -/* To be used in drbd_genlmsghdr.flags */ -enum { - DRBD_GENL_F_SET_DEFAULTS = 1, -}; - -enum drbd_state_info_bcast_reason { - SIB_GET_STATUS_REPLY = 1, - SIB_STATE_CHANGE = 2, - SIB_HELPER_PRE = 3, - SIB_HELPER_POST = 4, - SIB_SYNC_PROGRESS = 5, -}; - -/* hack around predefined gcc/cpp "linux=1", - * we cannot possibly include <1/drbd_genl.h> */ -#undef linux - -#include -#define GENL_MAGIC_VERSION API_VERSION -#define GENL_MAGIC_FAMILY drbd -#define GENL_MAGIC_FAMILY_HDRSZ sizeof(struct drbd_genlmsghdr) -#define GENL_MAGIC_INCLUDE_FILE -#include - -#endif diff --git a/trunk/include/linux/drbd_limits.h b/trunk/include/linux/drbd_limits.h index 1fa19c5f5e64..fb670bf603f7 100644 --- a/trunk/include/linux/drbd_limits.h +++ b/trunk/include/linux/drbd_limits.h @@ -16,37 +16,29 @@ #define DEBUG_RANGE_CHECK 0 #define DRBD_MINOR_COUNT_MIN 1 -#define DRBD_MINOR_COUNT_MAX 255 +#define DRBD_MINOR_COUNT_MAX 256 #define DRBD_MINOR_COUNT_DEF 32 -#define DRBD_MINOR_COUNT_SCALE '1' - -#define DRBD_VOLUME_MAX 65535 #define DRBD_DIALOG_REFRESH_MIN 0 #define DRBD_DIALOG_REFRESH_MAX 600 -#define DRBD_DIALOG_REFRESH_SCALE '1' /* valid port number */ #define DRBD_PORT_MIN 1 #define DRBD_PORT_MAX 0xffff -#define DRBD_PORT_SCALE '1' /* startup { */ /* if you want more than 3.4 days, disable */ #define DRBD_WFC_TIMEOUT_MIN 0 #define DRBD_WFC_TIMEOUT_MAX 300000 #define DRBD_WFC_TIMEOUT_DEF 0 -#define DRBD_WFC_TIMEOUT_SCALE '1' #define DRBD_DEGR_WFC_TIMEOUT_MIN 0 #define DRBD_DEGR_WFC_TIMEOUT_MAX 300000 #define DRBD_DEGR_WFC_TIMEOUT_DEF 0 -#define DRBD_DEGR_WFC_TIMEOUT_SCALE '1' #define DRBD_OUTDATED_WFC_TIMEOUT_MIN 0 #define DRBD_OUTDATED_WFC_TIMEOUT_MAX 300000 #define DRBD_OUTDATED_WFC_TIMEOUT_DEF 0 -#define 
DRBD_OUTDATED_WFC_TIMEOUT_SCALE '1' /* }*/ /* net { */ @@ -55,91 +47,75 @@ #define DRBD_TIMEOUT_MIN 1 #define DRBD_TIMEOUT_MAX 600 #define DRBD_TIMEOUT_DEF 60 /* 6 seconds */ -#define DRBD_TIMEOUT_SCALE '1' /* If backing disk takes longer than disk_timeout, mark the disk as failed */ #define DRBD_DISK_TIMEOUT_MIN 0 /* 0 = disabled */ #define DRBD_DISK_TIMEOUT_MAX 6000 /* 10 Minutes */ #define DRBD_DISK_TIMEOUT_DEF 0 /* disabled */ -#define DRBD_DISK_TIMEOUT_SCALE '1' /* active connection retries when C_WF_CONNECTION */ #define DRBD_CONNECT_INT_MIN 1 #define DRBD_CONNECT_INT_MAX 120 #define DRBD_CONNECT_INT_DEF 10 /* seconds */ -#define DRBD_CONNECT_INT_SCALE '1' /* keep-alive probes when idle */ #define DRBD_PING_INT_MIN 1 #define DRBD_PING_INT_MAX 120 #define DRBD_PING_INT_DEF 10 -#define DRBD_PING_INT_SCALE '1' /* timeout for the ping packets.*/ #define DRBD_PING_TIMEO_MIN 1 #define DRBD_PING_TIMEO_MAX 300 #define DRBD_PING_TIMEO_DEF 5 -#define DRBD_PING_TIMEO_SCALE '1' /* max number of write requests between write barriers */ #define DRBD_MAX_EPOCH_SIZE_MIN 1 #define DRBD_MAX_EPOCH_SIZE_MAX 20000 #define DRBD_MAX_EPOCH_SIZE_DEF 2048 -#define DRBD_MAX_EPOCH_SIZE_SCALE '1' /* I don't think that a tcp send buffer of more than 10M is useful */ #define DRBD_SNDBUF_SIZE_MIN 0 #define DRBD_SNDBUF_SIZE_MAX (10<<20) #define DRBD_SNDBUF_SIZE_DEF 0 -#define DRBD_SNDBUF_SIZE_SCALE '1' #define DRBD_RCVBUF_SIZE_MIN 0 #define DRBD_RCVBUF_SIZE_MAX (10<<20) #define DRBD_RCVBUF_SIZE_DEF 0 -#define DRBD_RCVBUF_SIZE_SCALE '1' /* @4k PageSize -> 128kB - 512MB */ #define DRBD_MAX_BUFFERS_MIN 32 #define DRBD_MAX_BUFFERS_MAX 131072 #define DRBD_MAX_BUFFERS_DEF 2048 -#define DRBD_MAX_BUFFERS_SCALE '1' /* @4k PageSize -> 4kB - 512MB */ #define DRBD_UNPLUG_WATERMARK_MIN 1 #define DRBD_UNPLUG_WATERMARK_MAX 131072 #define DRBD_UNPLUG_WATERMARK_DEF (DRBD_MAX_BUFFERS_DEF/16) -#define DRBD_UNPLUG_WATERMARK_SCALE '1' /* 0 is disabled. * 200 should be more than enough even for very short timeouts */ #define DRBD_KO_COUNT_MIN 0 #define DRBD_KO_COUNT_MAX 200 -#define DRBD_KO_COUNT_DEF 7 -#define DRBD_KO_COUNT_SCALE '1' +#define DRBD_KO_COUNT_DEF 0 /* } */ /* syncer { */ /* FIXME allow rate to be zero? */ -#define DRBD_RESYNC_RATE_MIN 1 +#define DRBD_RATE_MIN 1 /* channel bonding 10 GbE, or other hardware */ -#define DRBD_RESYNC_RATE_MAX (4 << 20) -#define DRBD_RESYNC_RATE_DEF 250 -#define DRBD_RESYNC_RATE_SCALE 'k' /* kilobytes */ +#define DRBD_RATE_MAX (4 << 20) +#define DRBD_RATE_DEF 250 /* kb/second */ /* less than 7 would hit performance unnecessarily. - * 919 slots context information per transaction, - * 32k activity log, 4k transaction size, - * one transaction in flight: - * 919 * 7 = 6433 */ + * 3833 is the largest prime that still does fit + * into 64 sectors of activity log */ #define DRBD_AL_EXTENTS_MIN 7 -#define DRBD_AL_EXTENTS_MAX 6433 -#define DRBD_AL_EXTENTS_DEF 1237 -#define DRBD_AL_EXTENTS_SCALE '1' +#define DRBD_AL_EXTENTS_MAX 3833 +#define DRBD_AL_EXTENTS_DEF 127 -#define DRBD_MINOR_NUMBER_MIN -1 -#define DRBD_MINOR_NUMBER_MAX ((1 << 20) - 1) -#define DRBD_MINOR_NUMBER_DEF -1 -#define DRBD_MINOR_NUMBER_SCALE '1' +#define DRBD_AFTER_MIN -1 +#define DRBD_AFTER_MAX 255 +#define DRBD_AFTER_DEF -1 /* } */ @@ -148,12 +124,11 @@ * the upper limit with 64bit kernel, enough ram and flexible meta data * is 1 PiB, currently. */ /* DRBD_MAX_SECTORS */ -#define DRBD_DISK_SIZE_MIN 0 -#define DRBD_DISK_SIZE_MAX (1 * (2LLU << 40)) -#define DRBD_DISK_SIZE_DEF 0 /* = disabled = no user size... 
*/ -#define DRBD_DISK_SIZE_SCALE 's' /* sectors */ +#define DRBD_DISK_SIZE_SECT_MIN 0 +#define DRBD_DISK_SIZE_SECT_MAX (1 * (2LLU << 40)) +#define DRBD_DISK_SIZE_SECT_DEF 0 /* = disabled = no user size... */ -#define DRBD_ON_IO_ERROR_DEF EP_DETACH +#define DRBD_ON_IO_ERROR_DEF EP_PASS_ON #define DRBD_FENCING_DEF FP_DONT_CARE #define DRBD_AFTER_SB_0P_DEF ASB_DISCONNECT #define DRBD_AFTER_SB_1P_DEF ASB_DISCONNECT @@ -161,59 +136,38 @@ #define DRBD_RR_CONFLICT_DEF ASB_DISCONNECT #define DRBD_ON_NO_DATA_DEF OND_IO_ERROR #define DRBD_ON_CONGESTION_DEF OC_BLOCK -#define DRBD_READ_BALANCING_DEF RB_PREFER_LOCAL #define DRBD_MAX_BIO_BVECS_MIN 0 #define DRBD_MAX_BIO_BVECS_MAX 128 #define DRBD_MAX_BIO_BVECS_DEF 0 -#define DRBD_MAX_BIO_BVECS_SCALE '1' #define DRBD_C_PLAN_AHEAD_MIN 0 #define DRBD_C_PLAN_AHEAD_MAX 300 -#define DRBD_C_PLAN_AHEAD_DEF 20 -#define DRBD_C_PLAN_AHEAD_SCALE '1' +#define DRBD_C_PLAN_AHEAD_DEF 0 /* RS rate controller disabled by default */ #define DRBD_C_DELAY_TARGET_MIN 1 #define DRBD_C_DELAY_TARGET_MAX 100 #define DRBD_C_DELAY_TARGET_DEF 10 -#define DRBD_C_DELAY_TARGET_SCALE '1' #define DRBD_C_FILL_TARGET_MIN 0 #define DRBD_C_FILL_TARGET_MAX (1<<20) /* 500MByte in sec */ -#define DRBD_C_FILL_TARGET_DEF 100 /* Try to place 50KiB in socket send buffer during resync */ -#define DRBD_C_FILL_TARGET_SCALE 's' /* sectors */ +#define DRBD_C_FILL_TARGET_DEF 0 /* By default disabled -> controlled by delay_target */ -#define DRBD_C_MAX_RATE_MIN 250 +#define DRBD_C_MAX_RATE_MIN 250 /* kByte/sec */ #define DRBD_C_MAX_RATE_MAX (4 << 20) #define DRBD_C_MAX_RATE_DEF 102400 -#define DRBD_C_MAX_RATE_SCALE 'k' /* kilobytes */ -#define DRBD_C_MIN_RATE_MIN 0 +#define DRBD_C_MIN_RATE_MIN 0 /* kByte/sec */ #define DRBD_C_MIN_RATE_MAX (4 << 20) -#define DRBD_C_MIN_RATE_DEF 250 -#define DRBD_C_MIN_RATE_SCALE 'k' /* kilobytes */ +#define DRBD_C_MIN_RATE_DEF 4096 #define DRBD_CONG_FILL_MIN 0 #define DRBD_CONG_FILL_MAX (10<<21) /* 10GByte in sectors */ #define DRBD_CONG_FILL_DEF 0 -#define DRBD_CONG_FILL_SCALE 's' /* sectors */ #define DRBD_CONG_EXTENTS_MIN DRBD_AL_EXTENTS_MIN #define DRBD_CONG_EXTENTS_MAX DRBD_AL_EXTENTS_MAX #define DRBD_CONG_EXTENTS_DEF DRBD_AL_EXTENTS_DEF -#define DRBD_CONG_EXTENTS_SCALE DRBD_AL_EXTENTS_SCALE - -#define DRBD_PROTOCOL_DEF DRBD_PROT_C - -#define DRBD_DISK_BARRIER_DEF 0 -#define DRBD_DISK_FLUSHES_DEF 1 -#define DRBD_DISK_DRAIN_DEF 1 -#define DRBD_MD_FLUSHES_DEF 1 -#define DRBD_TCP_CORK_DEF 1 -#define DRBD_AL_UPDATES_DEF 1 - -#define DRBD_ALLOW_TWO_PRIMARIES_DEF 0 -#define DRBD_ALWAYS_ASBP_DEF 0 -#define DRBD_USE_RLE_DEF 1 +#undef RANGE #endif diff --git a/trunk/include/linux/drbd_nl.h b/trunk/include/linux/drbd_nl.h new file mode 100644 index 000000000000..a8706f08ab36 --- /dev/null +++ b/trunk/include/linux/drbd_nl.h @@ -0,0 +1,163 @@ +/* + PAKET( name, + TYPE ( pn, pr, member ) + ... 
+ ) + + You may never reissue one of the pn arguments +*/ + +#if !defined(NL_PACKET) || !defined(NL_STRING) || !defined(NL_INTEGER) || !defined(NL_BIT) || !defined(NL_INT64) +#error "The macros NL_PACKET, NL_STRING, NL_INTEGER, NL_INT64 and NL_BIT needs to be defined" +#endif + +NL_PACKET(primary, 1, + NL_BIT( 1, T_MAY_IGNORE, primary_force) +) + +NL_PACKET(secondary, 2, ) + +NL_PACKET(disk_conf, 3, + NL_INT64( 2, T_MAY_IGNORE, disk_size) + NL_STRING( 3, T_MANDATORY, backing_dev, 128) + NL_STRING( 4, T_MANDATORY, meta_dev, 128) + NL_INTEGER( 5, T_MANDATORY, meta_dev_idx) + NL_INTEGER( 6, T_MAY_IGNORE, on_io_error) + NL_INTEGER( 7, T_MAY_IGNORE, fencing) + NL_BIT( 37, T_MAY_IGNORE, use_bmbv) + NL_BIT( 53, T_MAY_IGNORE, no_disk_flush) + NL_BIT( 54, T_MAY_IGNORE, no_md_flush) + /* 55 max_bio_size was available in 8.2.6rc2 */ + NL_INTEGER( 56, T_MAY_IGNORE, max_bio_bvecs) + NL_BIT( 57, T_MAY_IGNORE, no_disk_barrier) + NL_BIT( 58, T_MAY_IGNORE, no_disk_drain) + NL_INTEGER( 89, T_MAY_IGNORE, disk_timeout) +) + +NL_PACKET(detach, 4, + NL_BIT( 88, T_MANDATORY, detach_force) +) + +NL_PACKET(net_conf, 5, + NL_STRING( 8, T_MANDATORY, my_addr, 128) + NL_STRING( 9, T_MANDATORY, peer_addr, 128) + NL_STRING( 10, T_MAY_IGNORE, shared_secret, SHARED_SECRET_MAX) + NL_STRING( 11, T_MAY_IGNORE, cram_hmac_alg, SHARED_SECRET_MAX) + NL_STRING( 44, T_MAY_IGNORE, integrity_alg, SHARED_SECRET_MAX) + NL_INTEGER( 14, T_MAY_IGNORE, timeout) + NL_INTEGER( 15, T_MANDATORY, wire_protocol) + NL_INTEGER( 16, T_MAY_IGNORE, try_connect_int) + NL_INTEGER( 17, T_MAY_IGNORE, ping_int) + NL_INTEGER( 18, T_MAY_IGNORE, max_epoch_size) + NL_INTEGER( 19, T_MAY_IGNORE, max_buffers) + NL_INTEGER( 20, T_MAY_IGNORE, unplug_watermark) + NL_INTEGER( 21, T_MAY_IGNORE, sndbuf_size) + NL_INTEGER( 22, T_MAY_IGNORE, ko_count) + NL_INTEGER( 24, T_MAY_IGNORE, after_sb_0p) + NL_INTEGER( 25, T_MAY_IGNORE, after_sb_1p) + NL_INTEGER( 26, T_MAY_IGNORE, after_sb_2p) + NL_INTEGER( 39, T_MAY_IGNORE, rr_conflict) + NL_INTEGER( 40, T_MAY_IGNORE, ping_timeo) + NL_INTEGER( 67, T_MAY_IGNORE, rcvbuf_size) + NL_INTEGER( 81, T_MAY_IGNORE, on_congestion) + NL_INTEGER( 82, T_MAY_IGNORE, cong_fill) + NL_INTEGER( 83, T_MAY_IGNORE, cong_extents) + /* 59 addr_family was available in GIT, never released */ + NL_BIT( 60, T_MANDATORY, mind_af) + NL_BIT( 27, T_MAY_IGNORE, want_lose) + NL_BIT( 28, T_MAY_IGNORE, two_primaries) + NL_BIT( 41, T_MAY_IGNORE, always_asbp) + NL_BIT( 61, T_MAY_IGNORE, no_cork) + NL_BIT( 62, T_MANDATORY, auto_sndbuf_size) + NL_BIT( 70, T_MANDATORY, dry_run) +) + +NL_PACKET(disconnect, 6, + NL_BIT( 84, T_MAY_IGNORE, force) +) + +NL_PACKET(resize, 7, + NL_INT64( 29, T_MAY_IGNORE, resize_size) + NL_BIT( 68, T_MAY_IGNORE, resize_force) + NL_BIT( 69, T_MANDATORY, no_resync) +) + +NL_PACKET(syncer_conf, 8, + NL_INTEGER( 30, T_MAY_IGNORE, rate) + NL_INTEGER( 31, T_MAY_IGNORE, after) + NL_INTEGER( 32, T_MAY_IGNORE, al_extents) +/* NL_INTEGER( 71, T_MAY_IGNORE, dp_volume) + * NL_INTEGER( 72, T_MAY_IGNORE, dp_interval) + * NL_INTEGER( 73, T_MAY_IGNORE, throttle_th) + * NL_INTEGER( 74, T_MAY_IGNORE, hold_off_th) + * feature will be reimplemented differently with 8.3.9 */ + NL_STRING( 52, T_MAY_IGNORE, verify_alg, SHARED_SECRET_MAX) + NL_STRING( 51, T_MAY_IGNORE, cpu_mask, 32) + NL_STRING( 64, T_MAY_IGNORE, csums_alg, SHARED_SECRET_MAX) + NL_BIT( 65, T_MAY_IGNORE, use_rle) + NL_INTEGER( 75, T_MAY_IGNORE, on_no_data) + NL_INTEGER( 76, T_MAY_IGNORE, c_plan_ahead) + NL_INTEGER( 77, T_MAY_IGNORE, c_delay_target) + NL_INTEGER( 78, T_MAY_IGNORE, c_fill_target) + 
NL_INTEGER( 79, T_MAY_IGNORE, c_max_rate) + NL_INTEGER( 80, T_MAY_IGNORE, c_min_rate) +) + +NL_PACKET(invalidate, 9, ) +NL_PACKET(invalidate_peer, 10, ) +NL_PACKET(pause_sync, 11, ) +NL_PACKET(resume_sync, 12, ) +NL_PACKET(suspend_io, 13, ) +NL_PACKET(resume_io, 14, ) +NL_PACKET(outdate, 15, ) +NL_PACKET(get_config, 16, ) +NL_PACKET(get_state, 17, + NL_INTEGER( 33, T_MAY_IGNORE, state_i) +) + +NL_PACKET(get_uuids, 18, + NL_STRING( 34, T_MAY_IGNORE, uuids, (UI_SIZE*sizeof(__u64))) + NL_INTEGER( 35, T_MAY_IGNORE, uuids_flags) +) + +NL_PACKET(get_timeout_flag, 19, + NL_BIT( 36, T_MAY_IGNORE, use_degraded) +) + +NL_PACKET(call_helper, 20, + NL_STRING( 38, T_MAY_IGNORE, helper, 32) +) + +/* Tag nr 42 already allocated in drbd-8.1 development. */ + +NL_PACKET(sync_progress, 23, + NL_INTEGER( 43, T_MAY_IGNORE, sync_progress) +) + +NL_PACKET(dump_ee, 24, + NL_STRING( 45, T_MAY_IGNORE, dump_ee_reason, 32) + NL_STRING( 46, T_MAY_IGNORE, seen_digest, SHARED_SECRET_MAX) + NL_STRING( 47, T_MAY_IGNORE, calc_digest, SHARED_SECRET_MAX) + NL_INT64( 48, T_MAY_IGNORE, ee_sector) + NL_INT64( 49, T_MAY_IGNORE, ee_block_id) + NL_STRING( 50, T_MAY_IGNORE, ee_data, 32 << 10) +) + +NL_PACKET(start_ov, 25, + NL_INT64( 66, T_MAY_IGNORE, start_sector) +) + +NL_PACKET(new_c_uuid, 26, + NL_BIT( 63, T_MANDATORY, clear_bm) +) + +#ifdef NL_RESPONSE +NL_RESPONSE(return_code_only, 27) +#endif + +#undef NL_PACKET +#undef NL_INTEGER +#undef NL_INT64 +#undef NL_BIT +#undef NL_STRING +#undef NL_RESPONSE diff --git a/trunk/include/linux/drbd_tag_magic.h b/trunk/include/linux/drbd_tag_magic.h new file mode 100644 index 000000000000..82de1f9e48b1 --- /dev/null +++ b/trunk/include/linux/drbd_tag_magic.h @@ -0,0 +1,84 @@ +#ifndef DRBD_TAG_MAGIC_H +#define DRBD_TAG_MAGIC_H + +#define TT_END 0 +#define TT_REMOVED 0xE000 + +/* declare packet_type enums */ +enum packet_types { +#define NL_PACKET(name, number, fields) P_ ## name = number, +#define NL_RESPONSE(name, number) P_ ## name = number, +#define NL_INTEGER(pn, pr, member) +#define NL_INT64(pn, pr, member) +#define NL_BIT(pn, pr, member) +#define NL_STRING(pn, pr, member, len) +#include + P_nl_after_last_packet, +}; + +/* These struct are used to deduce the size of the tag lists: */ +#define NL_PACKET(name, number, fields) \ + struct name ## _tag_len_struct { fields }; +#define NL_INTEGER(pn, pr, member) \ + int member; int tag_and_len ## member; +#define NL_INT64(pn, pr, member) \ + __u64 member; int tag_and_len ## member; +#define NL_BIT(pn, pr, member) \ + unsigned char member:1; int tag_and_len ## member; +#define NL_STRING(pn, pr, member, len) \ + unsigned char member[len]; int member ## _len; \ + int tag_and_len ## member; +#include + +/* declare tag-list-sizes */ +static const int tag_list_sizes[] = { +#define NL_PACKET(name, number, fields) 2 fields , +#define NL_INTEGER(pn, pr, member) + 4 + 4 +#define NL_INT64(pn, pr, member) + 4 + 8 +#define NL_BIT(pn, pr, member) + 4 + 1 +#define NL_STRING(pn, pr, member, len) + 4 + (len) +#include +}; + +/* The two highest bits are used for the tag type */ +#define TT_MASK 0xC000 +#define TT_INTEGER 0x0000 +#define TT_INT64 0x4000 +#define TT_BIT 0x8000 +#define TT_STRING 0xC000 +/* The next bit indicates if processing of the tag is mandatory */ +#define T_MANDATORY 0x2000 +#define T_MAY_IGNORE 0x0000 +#define TN_MASK 0x1fff +/* The remaining 13 bits are used to enumerate the tags */ + +#define tag_type(T) ((T) & TT_MASK) +#define tag_number(T) ((T) & TN_MASK) + +/* declare tag enums */ +#define NL_PACKET(name, number, fields) fields 
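/*
 * [Editor's illustration -- not part of the patch itself] The tag layout that
 * drbd_tag_magic.h introduces above packs three things into one 16-bit tag
 * value: the 13-bit tag number (TN_MASK), the two type bits (TT_INTEGER,
 * TT_INT64, TT_BIT or TT_STRING) and the T_MANDATORY flag. A minimal sketch
 * of how a tag decomposes, using only the macros defined above and the
 * primary_force field declared in drbd_nl.h; the enum drbd_tags expansion
 * just below turns NL_BIT(1, T_MAY_IGNORE, primary_force) into exactly this
 * value. The function name tag_example is made up for illustration and the
 * block is guarded with #if 0 so it is never compiled:
 */
#if 0
static void tag_example(void)
{
	/* T_primary_force = 1 | TT_BIT | T_MAY_IGNORE */
	unsigned short t = 1 | TT_BIT | T_MAY_IGNORE;	/* 0x8001 */

	int type = tag_type(t);		/* 0x8000 == TT_BIT */
	int nr   = tag_number(t);	/* 1 */
	int mand = t & T_MANDATORY;	/* 0: receiver may ignore this tag */

	(void)type; (void)nr; (void)mand;
}
#endif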
+enum drbd_tags { +#define NL_INTEGER(pn, pr, member) T_ ## member = pn | TT_INTEGER | pr , +#define NL_INT64(pn, pr, member) T_ ## member = pn | TT_INT64 | pr , +#define NL_BIT(pn, pr, member) T_ ## member = pn | TT_BIT | pr , +#define NL_STRING(pn, pr, member, len) T_ ## member = pn | TT_STRING | pr , +#include +}; + +struct tag { + const char *name; + int type_n_flags; + int max_len; +}; + +/* declare tag names */ +#define NL_PACKET(name, number, fields) fields +static const struct tag tag_descriptions[] = { +#define NL_INTEGER(pn, pr, member) [ pn ] = { #member, TT_INTEGER | pr, sizeof(int) }, +#define NL_INT64(pn, pr, member) [ pn ] = { #member, TT_INT64 | pr, sizeof(__u64) }, +#define NL_BIT(pn, pr, member) [ pn ] = { #member, TT_BIT | pr, sizeof(int) }, +#define NL_STRING(pn, pr, member, len) [ pn ] = { #member, TT_STRING | pr, (len) }, +#include +}; + +#endif diff --git a/trunk/include/linux/fs.h b/trunk/include/linux/fs.h index 035521b46528..408fb1e77a0a 100644 --- a/trunk/include/linux/fs.h +++ b/trunk/include/linux/fs.h @@ -1810,8 +1810,6 @@ struct file_system_type { #define FS_REQUIRES_DEV 1 #define FS_BINARY_MOUNTDATA 2 #define FS_HAS_SUBTYPE 4 -#define FS_USERNS_MOUNT 8 /* Can be mounted by userns root */ -#define FS_USERNS_DEV_MOUNT 16 /* A userns mount does not imply MNT_NODEV */ #define FS_REVAL_DOT 16384 /* Check the paths ".", ".." for staleness */ #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */ struct dentry *(*mount) (struct file_system_type *, int, diff --git a/trunk/include/linux/genhd.h b/trunk/include/linux/genhd.h index 79b8bba19363..4f440b3e89fe 100644 --- a/trunk/include/linux/genhd.h +++ b/trunk/include/linux/genhd.h @@ -88,14 +88,10 @@ struct disk_stats { }; #define PARTITION_META_INFO_VOLNAMELTH 64 -/* - * Enough for the string representation of any kind of UUID plus NULL. - * EFI UUID is 36 characters. MSDOS UUID is 11 characters. 
- */ -#define PARTITION_META_INFO_UUIDLTH 37 +#define PARTITION_META_INFO_UUIDLTH 16 struct partition_meta_info { - char uuid[PARTITION_META_INFO_UUIDLTH]; + u8 uuid[PARTITION_META_INFO_UUIDLTH]; /* always big endian */ u8 volname[PARTITION_META_INFO_VOLNAMELTH]; }; diff --git a/trunk/include/linux/genl_magic_func.h b/trunk/include/linux/genl_magic_func.h deleted file mode 100644 index 023bc346b877..000000000000 --- a/trunk/include/linux/genl_magic_func.h +++ /dev/null @@ -1,422 +0,0 @@ -#ifndef GENL_MAGIC_FUNC_H -#define GENL_MAGIC_FUNC_H - -#include - -/* - * Magic: declare tla policy {{{1 - * Magic: declare nested policies - * {{{2 - */ -#undef GENL_mc_group -#define GENL_mc_group(group) - -#undef GENL_notification -#define GENL_notification(op_name, op_num, mcast_group, tla_list) - -#undef GENL_op -#define GENL_op(op_name, op_num, handler, tla_list) - -#undef GENL_struct -#define GENL_struct(tag_name, tag_number, s_name, s_fields) \ - [tag_name] = { .type = NLA_NESTED }, - -static struct nla_policy CONCAT_(GENL_MAGIC_FAMILY, _tla_nl_policy)[] = { -#include GENL_MAGIC_INCLUDE_FILE -}; - -#undef GENL_struct -#define GENL_struct(tag_name, tag_number, s_name, s_fields) \ -static struct nla_policy s_name ## _nl_policy[] __read_mostly = \ -{ s_fields }; - -#undef __field -#define __field(attr_nr, attr_flag, name, nla_type, _type, __get, \ - __put, __is_signed) \ - [attr_nr] = { .type = nla_type }, - -#undef __array -#define __array(attr_nr, attr_flag, name, nla_type, _type, maxlen, \ - __get, __put, __is_signed) \ - [attr_nr] = { .type = nla_type, \ - .len = maxlen - (nla_type == NLA_NUL_STRING) }, - -#include GENL_MAGIC_INCLUDE_FILE - -#ifndef __KERNEL__ -#ifndef pr_info -#define pr_info(args...) fprintf(stderr, args); -#endif -#endif - -#ifdef GENL_MAGIC_DEBUG -static void dprint_field(const char *dir, int nla_type, - const char *name, void *valp) -{ - __u64 val = valp ? *(__u32 *)valp : 1; - switch (nla_type) { - case NLA_U8: val = (__u8)val; - case NLA_U16: val = (__u16)val; - case NLA_U32: val = (__u32)val; - pr_info("%s attr %s: %d 0x%08x\n", dir, - name, (int)val, (unsigned)val); - break; - case NLA_U64: - val = *(__u64*)valp; - pr_info("%s attr %s: %lld 0x%08llx\n", dir, - name, (long long)val, (unsigned long long)val); - break; - case NLA_FLAG: - if (val) - pr_info("%s attr %s: set\n", dir, name); - break; - } -} - -static void dprint_array(const char *dir, int nla_type, - const char *name, const char *val, unsigned len) -{ - switch (nla_type) { - case NLA_NUL_STRING: - if (len && val[len-1] == '\0') - len--; - pr_info("%s attr %s: [len:%u] '%s'\n", dir, name, len, val); - break; - default: - /* we can always show 4 byte, - * thats what nlattr are aligned to. */ - pr_info("%s attr %s: [len:%u] %02x%02x%02x%02x ...\n", - dir, name, len, val[0], val[1], val[2], val[3]); - } -} - -#define DPRINT_TLA(a, op, b) pr_info("%s %s %s\n", a, op, b); - -/* Name is a member field name of the struct s. - * If s is NULL (only parsing, no copy requested in *_from_attrs()), - * nla is supposed to point to the attribute containing the information - * corresponding to that struct member. */ -#define DPRINT_FIELD(dir, nla_type, name, s, nla) \ - do { \ - if (s) \ - dprint_field(dir, nla_type, #name, &s->name); \ - else if (nla) \ - dprint_field(dir, nla_type, #name, \ - (nla_type == NLA_FLAG) ? 
NULL \ - : nla_data(nla)); \ - } while (0) - -#define DPRINT_ARRAY(dir, nla_type, name, s, nla) \ - do { \ - if (s) \ - dprint_array(dir, nla_type, #name, \ - s->name, s->name ## _len); \ - else if (nla) \ - dprint_array(dir, nla_type, #name, \ - nla_data(nla), nla_len(nla)); \ - } while (0) -#else -#define DPRINT_TLA(a, op, b) do {} while (0) -#define DPRINT_FIELD(dir, nla_type, name, s, nla) do {} while (0) -#define DPRINT_ARRAY(dir, nla_type, name, s, nla) do {} while (0) -#endif - -/* - * Magic: provide conversion functions {{{1 - * populate struct from attribute table: - * {{{2 - */ - -/* processing of generic netlink messages is serialized. - * use one static buffer for parsing of nested attributes */ -static struct nlattr *nested_attr_tb[128]; - -#ifndef BUILD_BUG_ON -/* Force a compilation error if condition is true */ -#define BUILD_BUG_ON(condition) ((void)BUILD_BUG_ON_ZERO(condition)) -/* Force a compilation error if condition is true, but also produce a - result (of value 0 and type size_t), so the expression can be used - e.g. in a structure initializer (or where-ever else comma expressions - aren't permitted). */ -#define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:-!!(e); })) -#define BUILD_BUG_ON_NULL(e) ((void *)sizeof(struct { int:-!!(e); })) -#endif - -#undef GENL_struct -#define GENL_struct(tag_name, tag_number, s_name, s_fields) \ -/* *_from_attrs functions are static, but potentially unused */ \ -static int __ ## s_name ## _from_attrs(struct s_name *s, \ - struct genl_info *info, bool exclude_invariants) \ -{ \ - const int maxtype = ARRAY_SIZE(s_name ## _nl_policy)-1; \ - struct nlattr *tla = info->attrs[tag_number]; \ - struct nlattr **ntb = nested_attr_tb; \ - struct nlattr *nla; \ - int err; \ - BUILD_BUG_ON(ARRAY_SIZE(s_name ## _nl_policy) > ARRAY_SIZE(nested_attr_tb)); \ - if (!tla) \ - return -ENOMSG; \ - DPRINT_TLA(#s_name, "<=-", #tag_name); \ - err = drbd_nla_parse_nested(ntb, maxtype, tla, s_name ## _nl_policy); \ - if (err) \ - return err; \ - \ - s_fields \ - return 0; \ -} __attribute__((unused)) \ -static int s_name ## _from_attrs(struct s_name *s, \ - struct genl_info *info) \ -{ \ - return __ ## s_name ## _from_attrs(s, info, false); \ -} __attribute__((unused)) \ -static int s_name ## _from_attrs_for_change(struct s_name *s, \ - struct genl_info *info) \ -{ \ - return __ ## s_name ## _from_attrs(s, info, true); \ -} __attribute__((unused)) \ - -#define __assign(attr_nr, attr_flag, name, nla_type, type, assignment...) \ - nla = ntb[attr_nr]; \ - if (nla) { \ - if (exclude_invariants && ((attr_flag) & DRBD_F_INVARIANT)) { \ - pr_info("<< must not change invariant attr: %s\n", #name); \ - return -EEXIST; \ - } \ - assignment; \ - } else if (exclude_invariants && ((attr_flag) & DRBD_F_INVARIANT)) { \ - /* attribute missing from payload, */ \ - /* which was expected */ \ - } else if ((attr_flag) & DRBD_F_REQUIRED) { \ - pr_info("<< missing attr: %s\n", #name); \ - return -ENOMSG; \ - } - -#undef __field -#define __field(attr_nr, attr_flag, name, nla_type, type, __get, __put, \ - __is_signed) \ - __assign(attr_nr, attr_flag, name, nla_type, type, \ - if (s) \ - s->name = __get(nla); \ - DPRINT_FIELD("<<", nla_type, name, s, nla)) - -/* validate_nla() already checked nla_len <= maxlen appropriately. 
*/ -#undef __array -#define __array(attr_nr, attr_flag, name, nla_type, type, maxlen, \ - __get, __put, __is_signed) \ - __assign(attr_nr, attr_flag, name, nla_type, type, \ - if (s) \ - s->name ## _len = \ - __get(s->name, nla, maxlen); \ - DPRINT_ARRAY("<<", nla_type, name, s, nla)) - -#include GENL_MAGIC_INCLUDE_FILE - -#undef GENL_struct -#define GENL_struct(tag_name, tag_number, s_name, s_fields) - -/* - * Magic: define op number to op name mapping {{{1 - * {{{2 - */ -const char *CONCAT_(GENL_MAGIC_FAMILY, _genl_cmd_to_str)(__u8 cmd) -{ - switch (cmd) { -#undef GENL_op -#define GENL_op(op_name, op_num, handler, tla_list) \ - case op_num: return #op_name; -#include GENL_MAGIC_INCLUDE_FILE - default: - return "unknown"; - } -} - -#ifdef __KERNEL__ -#include -/* - * Magic: define genl_ops {{{1 - * {{{2 - */ - -#undef GENL_op -#define GENL_op(op_name, op_num, handler, tla_list) \ -{ \ - handler \ - .cmd = op_name, \ - .policy = CONCAT_(GENL_MAGIC_FAMILY, _tla_nl_policy), \ -}, - -#define ZZZ_genl_ops CONCAT_(GENL_MAGIC_FAMILY, _genl_ops) -static struct genl_ops ZZZ_genl_ops[] __read_mostly = { -#include GENL_MAGIC_INCLUDE_FILE -}; - -#undef GENL_op -#define GENL_op(op_name, op_num, handler, tla_list) - -/* - * Define the genl_family, multicast groups, {{{1 - * and provide register/unregister functions. - * {{{2 - */ -#define ZZZ_genl_family CONCAT_(GENL_MAGIC_FAMILY, _genl_family) -static struct genl_family ZZZ_genl_family __read_mostly = { - .id = GENL_ID_GENERATE, - .name = __stringify(GENL_MAGIC_FAMILY), - .version = GENL_MAGIC_VERSION, -#ifdef GENL_MAGIC_FAMILY_HDRSZ - .hdrsize = NLA_ALIGN(GENL_MAGIC_FAMILY_HDRSZ), -#endif - .maxattr = ARRAY_SIZE(drbd_tla_nl_policy)-1, -}; - -/* - * Magic: define multicast groups - * Magic: define multicast group registration helper - */ -#undef GENL_mc_group -#define GENL_mc_group(group) \ -static struct genl_multicast_group \ -CONCAT_(GENL_MAGIC_FAMILY, _mcg_ ## group) __read_mostly = { \ - .name = #group, \ -}; \ -static int CONCAT_(GENL_MAGIC_FAMILY, _genl_multicast_ ## group)( \ - struct sk_buff *skb, gfp_t flags) \ -{ \ - unsigned int group_id = \ - CONCAT_(GENL_MAGIC_FAMILY, _mcg_ ## group).id; \ - if (!group_id) \ - return -EINVAL; \ - return genlmsg_multicast(skb, 0, group_id, flags); \ -} - -#include GENL_MAGIC_INCLUDE_FILE - -int CONCAT_(GENL_MAGIC_FAMILY, _genl_register)(void) -{ - int err = genl_register_family_with_ops(&ZZZ_genl_family, - ZZZ_genl_ops, ARRAY_SIZE(ZZZ_genl_ops)); - if (err) - return err; -#undef GENL_mc_group -#define GENL_mc_group(group) \ - err = genl_register_mc_group(&ZZZ_genl_family, \ - &CONCAT_(GENL_MAGIC_FAMILY, _mcg_ ## group)); \ - if (err) \ - goto fail; \ - else \ - pr_info("%s: mcg %s: %u\n", #group, \ - __stringify(GENL_MAGIC_FAMILY), \ - CONCAT_(GENL_MAGIC_FAMILY, _mcg_ ## group).id); - -#include GENL_MAGIC_INCLUDE_FILE - -#undef GENL_mc_group -#define GENL_mc_group(group) - return 0; -fail: - genl_unregister_family(&ZZZ_genl_family); - return err; -} - -void CONCAT_(GENL_MAGIC_FAMILY, _genl_unregister)(void) -{ - genl_unregister_family(&ZZZ_genl_family); -} - -/* - * Magic: provide conversion functions {{{1 - * populate skb from struct. 
- * {{{2 - */ - -#undef GENL_op -#define GENL_op(op_name, op_num, handler, tla_list) - -#undef GENL_struct -#define GENL_struct(tag_name, tag_number, s_name, s_fields) \ -static int s_name ## _to_skb(struct sk_buff *skb, struct s_name *s, \ - const bool exclude_sensitive) \ -{ \ - struct nlattr *tla = nla_nest_start(skb, tag_number); \ - if (!tla) \ - goto nla_put_failure; \ - DPRINT_TLA(#s_name, "-=>", #tag_name); \ - s_fields \ - nla_nest_end(skb, tla); \ - return 0; \ - \ -nla_put_failure: \ - if (tla) \ - nla_nest_cancel(skb, tla); \ - return -EMSGSIZE; \ -} \ -static inline int s_name ## _to_priv_skb(struct sk_buff *skb, \ - struct s_name *s) \ -{ \ - return s_name ## _to_skb(skb, s, 0); \ -} \ -static inline int s_name ## _to_unpriv_skb(struct sk_buff *skb, \ - struct s_name *s) \ -{ \ - return s_name ## _to_skb(skb, s, 1); \ -} - - -#undef __field -#define __field(attr_nr, attr_flag, name, nla_type, type, __get, __put, \ - __is_signed) \ - if (!exclude_sensitive || !((attr_flag) & DRBD_F_SENSITIVE)) { \ - DPRINT_FIELD(">>", nla_type, name, s, NULL); \ - if (__put(skb, attr_nr, s->name)) \ - goto nla_put_failure; \ - } - -#undef __array -#define __array(attr_nr, attr_flag, name, nla_type, type, maxlen, \ - __get, __put, __is_signed) \ - if (!exclude_sensitive || !((attr_flag) & DRBD_F_SENSITIVE)) { \ - DPRINT_ARRAY(">>",nla_type, name, s, NULL); \ - if (__put(skb, attr_nr, min_t(int, maxlen, \ - s->name ## _len + (nla_type == NLA_NUL_STRING)),\ - s->name)) \ - goto nla_put_failure; \ - } - -#include GENL_MAGIC_INCLUDE_FILE - - -/* Functions for initializing structs to default values. */ - -#undef __field -#define __field(attr_nr, attr_flag, name, nla_type, type, __get, __put, \ - __is_signed) -#undef __array -#define __array(attr_nr, attr_flag, name, nla_type, type, maxlen, \ - __get, __put, __is_signed) -#undef __u32_field_def -#define __u32_field_def(attr_nr, attr_flag, name, default) \ - x->name = default; -#undef __s32_field_def -#define __s32_field_def(attr_nr, attr_flag, name, default) \ - x->name = default; -#undef __flg_field_def -#define __flg_field_def(attr_nr, attr_flag, name, default) \ - x->name = default; -#undef __str_field_def -#define __str_field_def(attr_nr, attr_flag, name, maxlen) \ - memset(x->name, 0, sizeof(x->name)); \ - x->name ## _len = 0; -#undef GENL_struct -#define GENL_struct(tag_name, tag_number, s_name, s_fields) \ -static void set_ ## s_name ## _defaults(struct s_name *x) __attribute__((unused)); \ -static void set_ ## s_name ## _defaults(struct s_name *x) { \ -s_fields \ -} - -#include GENL_MAGIC_INCLUDE_FILE - -#endif /* __KERNEL__ */ - -/* }}}1 */ -#endif /* GENL_MAGIC_FUNC_H */ -/* vim: set foldmethod=marker foldlevel=1 nofoldenable : */ diff --git a/trunk/include/linux/genl_magic_struct.h b/trunk/include/linux/genl_magic_struct.h deleted file mode 100644 index eecd19b37001..000000000000 --- a/trunk/include/linux/genl_magic_struct.h +++ /dev/null @@ -1,277 +0,0 @@ -#ifndef GENL_MAGIC_STRUCT_H -#define GENL_MAGIC_STRUCT_H - -#ifndef GENL_MAGIC_FAMILY -# error "you need to define GENL_MAGIC_FAMILY before inclusion" -#endif - -#ifndef GENL_MAGIC_VERSION -# error "you need to define GENL_MAGIC_VERSION before inclusion" -#endif - -#ifndef GENL_MAGIC_INCLUDE_FILE -# error "you need to define GENL_MAGIC_INCLUDE_FILE before inclusion" -#endif - -#include -#include - -#define CONCAT__(a,b) a ## b -#define CONCAT_(a,b) CONCAT__(a,b) - -extern int CONCAT_(GENL_MAGIC_FAMILY, _genl_register)(void); -extern void CONCAT_(GENL_MAGIC_FAMILY, 
_genl_unregister)(void); - -/* - * Extension of genl attribute validation policies {{{2 - */ - -/* - * @DRBD_GENLA_F_MANDATORY: By default, netlink ignores attributes it does not - * know about. This flag can be set in nlattr->nla_type to indicate that this - * attribute must not be ignored. - * - * We check and remove this flag in drbd_nla_check_mandatory() before - * validating the attribute types and lengths via nla_parse_nested(). - */ -#define DRBD_GENLA_F_MANDATORY (1 << 14) - -/* - * Flags specific to drbd and not visible at the netlink layer, used in - * _from_attrs and _to_skb: - * - * @DRBD_F_REQUIRED: Attribute is required; a request without this attribute is - * invalid. - * - * @DRBD_F_SENSITIVE: Attribute includes sensitive information and must not be - * included in unpriviledged get requests or broadcasts. - * - * @DRBD_F_INVARIANT: Attribute is set when an object is initially created, but - * cannot subsequently be changed. - */ -#define DRBD_F_REQUIRED (1 << 0) -#define DRBD_F_SENSITIVE (1 << 1) -#define DRBD_F_INVARIANT (1 << 2) - -#define __nla_type(x) ((__u16)((x) & NLA_TYPE_MASK & ~DRBD_GENLA_F_MANDATORY)) - -/* }}}1 - * MAGIC - * multi-include macro expansion magic starts here - */ - -/* MAGIC helpers {{{2 */ - -/* possible field types */ -#define __flg_field(attr_nr, attr_flag, name) \ - __field(attr_nr, attr_flag, name, NLA_U8, char, \ - nla_get_u8, nla_put_u8, false) -#define __u8_field(attr_nr, attr_flag, name) \ - __field(attr_nr, attr_flag, name, NLA_U8, unsigned char, \ - nla_get_u8, nla_put_u8, false) -#define __u16_field(attr_nr, attr_flag, name) \ - __field(attr_nr, attr_flag, name, NLA_U16, __u16, \ - nla_get_u16, nla_put_u16, false) -#define __u32_field(attr_nr, attr_flag, name) \ - __field(attr_nr, attr_flag, name, NLA_U32, __u32, \ - nla_get_u32, nla_put_u32, false) -#define __s32_field(attr_nr, attr_flag, name) \ - __field(attr_nr, attr_flag, name, NLA_U32, __s32, \ - nla_get_u32, nla_put_u32, true) -#define __u64_field(attr_nr, attr_flag, name) \ - __field(attr_nr, attr_flag, name, NLA_U64, __u64, \ - nla_get_u64, nla_put_u64, false) -#define __str_field(attr_nr, attr_flag, name, maxlen) \ - __array(attr_nr, attr_flag, name, NLA_NUL_STRING, char, maxlen, \ - nla_strlcpy, nla_put, false) -#define __bin_field(attr_nr, attr_flag, name, maxlen) \ - __array(attr_nr, attr_flag, name, NLA_BINARY, char, maxlen, \ - nla_memcpy, nla_put, false) - -/* fields with default values */ -#define __flg_field_def(attr_nr, attr_flag, name, default) \ - __flg_field(attr_nr, attr_flag, name) -#define __u32_field_def(attr_nr, attr_flag, name, default) \ - __u32_field(attr_nr, attr_flag, name) -#define __s32_field_def(attr_nr, attr_flag, name, default) \ - __s32_field(attr_nr, attr_flag, name) -#define __str_field_def(attr_nr, attr_flag, name, maxlen) \ - __str_field(attr_nr, attr_flag, name, maxlen) - -#define GENL_op_init(args...) 
args -#define GENL_doit(handler) \ - .doit = handler, \ - .flags = GENL_ADMIN_PERM, -#define GENL_dumpit(handler) \ - .dumpit = handler, \ - .flags = GENL_ADMIN_PERM, - -/* }}}1 - * Magic: define the enum symbols for genl_ops - * Magic: define the enum symbols for top level attributes - * Magic: define the enum symbols for nested attributes - * {{{2 - */ - -#undef GENL_struct -#define GENL_struct(tag_name, tag_number, s_name, s_fields) - -#undef GENL_mc_group -#define GENL_mc_group(group) - -#undef GENL_notification -#define GENL_notification(op_name, op_num, mcast_group, tla_list) \ - op_name = op_num, - -#undef GENL_op -#define GENL_op(op_name, op_num, handler, tla_list) \ - op_name = op_num, - -enum { -#include GENL_MAGIC_INCLUDE_FILE -}; - -#undef GENL_notification -#define GENL_notification(op_name, op_num, mcast_group, tla_list) - -#undef GENL_op -#define GENL_op(op_name, op_num, handler, attr_list) - -#undef GENL_struct -#define GENL_struct(tag_name, tag_number, s_name, s_fields) \ - tag_name = tag_number, - -enum { -#include GENL_MAGIC_INCLUDE_FILE -}; - -#undef GENL_struct -#define GENL_struct(tag_name, tag_number, s_name, s_fields) \ -enum { \ - s_fields \ -}; - -#undef __field -#define __field(attr_nr, attr_flag, name, nla_type, type, \ - __get, __put, __is_signed) \ - T_ ## name = (__u16)(attr_nr | ((attr_flag) & DRBD_GENLA_F_MANDATORY)), - -#undef __array -#define __array(attr_nr, attr_flag, name, nla_type, type, \ - maxlen, __get, __put, __is_signed) \ - T_ ## name = (__u16)(attr_nr | ((attr_flag) & DRBD_GENLA_F_MANDATORY)), - -#include GENL_MAGIC_INCLUDE_FILE - -/* }}}1 - * Magic: compile time assert unique numbers for operations - * Magic: -"- unique numbers for top level attributes - * Magic: -"- unique numbers for nested attributes - * {{{2 - */ - -#undef GENL_struct -#define GENL_struct(tag_name, tag_number, s_name, s_fields) - -#undef GENL_op -#define GENL_op(op_name, op_num, handler, attr_list) \ - case op_name: - -#undef GENL_notification -#define GENL_notification(op_name, op_num, mcast_group, tla_list) \ - case op_name: - -static inline void ct_assert_unique_operations(void) -{ - switch (0) { -#include GENL_MAGIC_INCLUDE_FILE - ; - } -} - -#undef GENL_op -#define GENL_op(op_name, op_num, handler, attr_list) - -#undef GENL_notification -#define GENL_notification(op_name, op_num, mcast_group, tla_list) - -#undef GENL_struct -#define GENL_struct(tag_name, tag_number, s_name, s_fields) \ - case tag_number: - -static inline void ct_assert_unique_top_level_attributes(void) -{ - switch (0) { -#include GENL_MAGIC_INCLUDE_FILE - ; - } -} - -#undef GENL_struct -#define GENL_struct(tag_name, tag_number, s_name, s_fields) \ -static inline void ct_assert_unique_ ## s_name ## _attributes(void) \ -{ \ - switch (0) { \ - s_fields \ - ; \ - } \ -} - -#undef __field -#define __field(attr_nr, attr_flag, name, nla_type, type, __get, __put, \ - __is_signed) \ - case attr_nr: - -#undef __array -#define __array(attr_nr, attr_flag, name, nla_type, type, maxlen, \ - __get, __put, __is_signed) \ - case attr_nr: - -#include GENL_MAGIC_INCLUDE_FILE - -/* }}}1 - * Magic: declare structs - * struct { - * fields - * }; - * {{{2 - */ - -#undef GENL_struct -#define GENL_struct(tag_name, tag_number, s_name, s_fields) \ -struct s_name { s_fields }; - -#undef __field -#define __field(attr_nr, attr_flag, name, nla_type, type, __get, __put, \ - __is_signed) \ - type name; - -#undef __array -#define __array(attr_nr, attr_flag, name, nla_type, type, maxlen, \ - __get, __put, __is_signed) \ - type 
name[maxlen]; \ - __u32 name ## _len; - -#include GENL_MAGIC_INCLUDE_FILE - -#undef GENL_struct -#define GENL_struct(tag_name, tag_number, s_name, s_fields) \ -enum { \ - s_fields \ -}; - -#undef __field -#define __field(attr_nr, attr_flag, name, nla_type, type, __get, __put, \ - is_signed) \ - F_ ## name ## _IS_SIGNED = is_signed, - -#undef __array -#define __array(attr_nr, attr_flag, name, nla_type, type, maxlen, \ - __get, __put, is_signed) \ - F_ ## name ## _IS_SIGNED = is_signed, - -#include GENL_MAGIC_INCLUDE_FILE - -/* }}}1 */ -#endif /* GENL_MAGIC_STRUCT_H */ -/* vim: set foldmethod=marker nofoldenable : */ diff --git a/trunk/include/linux/idr.h b/trunk/include/linux/idr.h index de7e190f1af4..87259a44c251 100644 --- a/trunk/include/linux/idr.h +++ b/trunk/include/linux/idr.h @@ -152,15 +152,4 @@ void ida_simple_remove(struct ida *ida, unsigned int id); void __init idr_init_cache(void); -/** - * idr_for_each_entry - iterate over an idr's elements of a given type - * @idp: idr handle - * @entry: the type * to use as cursor - * @id: id entry's key - */ -#define idr_for_each_entry(idp, entry, id) \ - for (id = 0, entry = (typeof(entry))idr_get_next((idp), &(id)); \ - entry != NULL; \ - ++id, entry = (typeof(entry))idr_get_next((idp), &(id))) - #endif /* __IDR_H__ */ diff --git a/trunk/include/linux/ipc_namespace.h b/trunk/include/linux/ipc_namespace.h index fe771978e877..5499c92a9153 100644 --- a/trunk/include/linux/ipc_namespace.h +++ b/trunk/include/linux/ipc_namespace.h @@ -67,8 +67,6 @@ struct ipc_namespace { /* user_ns which owns the ipc ns */ struct user_namespace *user_ns; - - unsigned int proc_inum; }; extern struct ipc_namespace init_ipc_ns; @@ -135,8 +133,7 @@ static inline int mq_init_ns(struct ipc_namespace *ns) { return 0; } #if defined(CONFIG_IPC_NS) extern struct ipc_namespace *copy_ipcs(unsigned long flags, - struct user_namespace *user_ns, struct ipc_namespace *ns); - + struct task_struct *tsk); static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns) { if (ns) @@ -147,12 +144,12 @@ static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns) extern void put_ipc_ns(struct ipc_namespace *ns); #else static inline struct ipc_namespace *copy_ipcs(unsigned long flags, - struct user_namespace *user_ns, struct ipc_namespace *ns) + struct task_struct *tsk) { if (flags & CLONE_NEWIPC) return ERR_PTR(-EINVAL); - return ns; + return tsk->nsproxy->ipc_ns; } static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns) diff --git a/trunk/include/linux/loop.h b/trunk/include/linux/loop.h index 460b60fa7adf..6492181bcb1d 100644 --- a/trunk/include/linux/loop.h +++ b/trunk/include/linux/loop.h @@ -53,13 +53,10 @@ struct loop_device { spinlock_t lo_lock; struct bio_list lo_bio_list; - unsigned int lo_bio_count; int lo_state; struct mutex lo_ctl_mutex; struct task_struct *lo_thread; wait_queue_head_t lo_event; - /* wait queue for incoming requests */ - wait_queue_head_t lo_req_wait; struct request_queue *lo_queue; struct gendisk *lo_disk; diff --git a/trunk/include/linux/lru_cache.h b/trunk/include/linux/lru_cache.h index 4019013c6593..cafc7f99e124 100644 --- a/trunk/include/linux/lru_cache.h +++ b/trunk/include/linux/lru_cache.h @@ -166,11 +166,9 @@ struct lc_element { /* if we want to track a larger set of objects, * it needs to become arch independend u64 */ unsigned lc_number; + /* special label when on free list */ #define LC_FREE (~0U) - - /* for pending changes */ - unsigned lc_new_number; }; struct lru_cache { @@ -178,7 +176,6 @@ struct 
lru_cache { struct list_head lru; struct list_head free; struct list_head in_use; - struct list_head to_be_changed; /* the pre-created kmem cache to allocate the objects from */ struct kmem_cache *lc_cache; @@ -189,7 +186,7 @@ struct lru_cache { size_t element_off; /* number of elements (indices) */ - unsigned int nr_elements; + unsigned int nr_elements; /* Arbitrary limit on maximum tracked objects. Practical limit is much * lower due to allocation failures, probably. For typical use cases, * nr_elements should be a few thousand at most. @@ -197,19 +194,18 @@ struct lru_cache { * 8 high bits of .lc_index to be overloaded with flags in the future. */ #define LC_MAX_ACTIVE (1<<24) - /* allow to accumulate a few (index:label) changes, - * but no more than max_pending_changes */ - unsigned int max_pending_changes; - /* number of elements currently on to_be_changed list */ - unsigned int pending_changes; - /* statistics */ - unsigned used; /* number of elements currently on in_use list */ - unsigned long hits, misses, starving, locked, changed; + unsigned used; /* number of lelements currently on in_use list */ + unsigned long hits, misses, starving, dirty, changed; /* see below: flag-bits for lru_cache */ unsigned long flags; + /* when changing the label of an index element */ + unsigned int new_number; + + /* for paranoia when changing the label of an index element */ + struct lc_element *changing_element; void *lc_private; const char *name; @@ -225,15 +221,10 @@ enum { /* debugging aid, to catch concurrent access early. * user needs to guarantee exclusive access by proper locking! */ __LC_PARANOIA, - - /* annotate that the set is "dirty", possibly accumulating further - * changes, until a transaction is finally triggered */ + /* if we need to change the set, but currently there is a changing + * transaction pending, we are "dirty", and must deferr further + * changing requests */ __LC_DIRTY, - - /* Locked, no further changes allowed. - * Also used to serialize changing transactions. 
*/ - __LC_LOCKED, - /* if we need to change the set, but currently there is no free nor * unused element available, we are "starving", and must not give out * further references, to guarantee that eventually some refcnt will @@ -245,11 +236,9 @@ enum { }; #define LC_PARANOIA (1<<__LC_PARANOIA) #define LC_DIRTY (1<<__LC_DIRTY) -#define LC_LOCKED (1<<__LC_LOCKED) #define LC_STARVING (1<<__LC_STARVING) extern struct lru_cache *lc_create(const char *name, struct kmem_cache *cache, - unsigned max_pending_changes, unsigned e_count, size_t e_size, size_t e_off); extern void lc_reset(struct lru_cache *lc); extern void lc_destroy(struct lru_cache *lc); @@ -260,7 +249,7 @@ extern struct lc_element *lc_try_get(struct lru_cache *lc, unsigned int enr); extern struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr); extern struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr); extern unsigned int lc_put(struct lru_cache *lc, struct lc_element *e); -extern void lc_committed(struct lru_cache *lc); +extern void lc_changed(struct lru_cache *lc, struct lc_element *e); struct seq_file; extern size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc); @@ -269,29 +258,17 @@ extern void lc_seq_dump_details(struct seq_file *seq, struct lru_cache *lc, char void (*detail) (struct seq_file *, struct lc_element *)); /** - * lc_try_lock_for_transaction - can be used to stop lc_get() from changing the tracked set + * lc_try_lock - can be used to stop lc_get() from changing the tracked set * @lc: the lru cache to operate on * - * Allows (expects) the set to be "dirty". Note that the reference counts and - * order on the active and lru lists may still change. Used to serialize - * changing transactions. Returns true if we aquired the lock. + * Note that the reference counts and order on the active and lru lists may + * still change. Returns true if we acquired the lock. */ -static inline int lc_try_lock_for_transaction(struct lru_cache *lc) +static inline int lc_try_lock(struct lru_cache *lc) { - return !test_and_set_bit(__LC_LOCKED, &lc->flags); + return !test_and_set_bit(__LC_DIRTY, &lc->flags); } -/** - * lc_try_lock - variant to stop lc_get() from changing the tracked set - * @lc: the lru cache to operate on - * - * Note that the reference counts and order on the active and lru lists may - * still change. Only works on a "clean" set. Returns true if we aquired the - * lock, which means there are no pending changes, and any further attempt to - * change the set will not succeed until the next lc_unlock(). 
- */ -extern int lc_try_lock(struct lru_cache *lc); - /** * lc_unlock - unlock @lc, allow lc_get() to change the set again * @lc: the lru cache to operate on @@ -299,10 +276,14 @@ extern int lc_try_lock(struct lru_cache *lc); static inline void lc_unlock(struct lru_cache *lc) { clear_bit(__LC_DIRTY, &lc->flags); - clear_bit_unlock(__LC_LOCKED, &lc->flags); + smp_mb__after_clear_bit(); } -extern bool lc_is_used(struct lru_cache *lc, unsigned int enr); +static inline int lc_is_used(struct lru_cache *lc, unsigned int enr) +{ + struct lc_element *e = lc_find(lc, enr); + return e && e->refcnt; +} #define lc_entry(ptr, type, member) \ container_of(ptr, type, member) diff --git a/trunk/include/linux/mnt_namespace.h b/trunk/include/linux/mnt_namespace.h index 12b2ab510323..5a8e3903d770 100644 --- a/trunk/include/linux/mnt_namespace.h +++ b/trunk/include/linux/mnt_namespace.h @@ -4,10 +4,9 @@ struct mnt_namespace; struct fs_struct; -struct user_namespace; extern struct mnt_namespace *copy_mnt_ns(unsigned long, struct mnt_namespace *, - struct user_namespace *, struct fs_struct *); + struct fs_struct *); extern void put_mnt_ns(struct mnt_namespace *ns); extern const struct file_operations proc_mounts_operations; diff --git a/trunk/include/linux/nsproxy.h b/trunk/include/linux/nsproxy.h index 10e5947491c7..cc37a55ad004 100644 --- a/trunk/include/linux/nsproxy.h +++ b/trunk/include/linux/nsproxy.h @@ -67,7 +67,7 @@ void exit_task_namespaces(struct task_struct *tsk); void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new); void free_nsproxy(struct nsproxy *ns); int unshare_nsproxy_namespaces(unsigned long, struct nsproxy **, - struct cred *, struct fs_struct *); + struct fs_struct *); int __init nsproxy_cache_init(void); static inline void put_nsproxy(struct nsproxy *ns) diff --git a/trunk/include/linux/pid_namespace.h b/trunk/include/linux/pid_namespace.h index bf285999273a..65e3e87eacc5 100644 --- a/trunk/include/linux/pid_namespace.h +++ b/trunk/include/linux/pid_namespace.h @@ -21,7 +21,6 @@ struct pid_namespace { struct kref kref; struct pidmap pidmap[PIDMAP_ENTRIES]; int last_pid; - int nr_hashed; struct task_struct *child_reaper; struct kmem_cache *pid_cachep; unsigned int level; @@ -32,12 +31,9 @@ struct pid_namespace { #ifdef CONFIG_BSD_PROCESS_ACCT struct bsd_acct_struct *bacct; #endif - struct user_namespace *user_ns; - struct work_struct proc_work; kgid_t pid_gid; int hide_pid; int reboot; /* group exit code if this pidns was rebooted */ - unsigned int proc_inum; }; extern struct pid_namespace init_pid_ns; @@ -50,8 +46,7 @@ static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns) return ns; } -extern struct pid_namespace *copy_pid_ns(unsigned long flags, - struct user_namespace *user_ns, struct pid_namespace *ns); +extern struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *ns); extern void zap_pid_ns_processes(struct pid_namespace *pid_ns); extern int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd); extern void put_pid_ns(struct pid_namespace *ns); @@ -64,8 +59,8 @@ static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns) return ns; } -static inline struct pid_namespace *copy_pid_ns(unsigned long flags, - struct user_namespace *user_ns, struct pid_namespace *ns) +static inline struct pid_namespace * +copy_pid_ns(unsigned long flags, struct pid_namespace *ns) { if (flags & CLONE_NEWPID) ns = ERR_PTR(-EINVAL); diff --git a/trunk/include/linux/proc_fs.h b/trunk/include/linux/proc_fs.h index 
2e24018b7cec..3fd2e871ff1b 100644 --- a/trunk/include/linux/proc_fs.h +++ b/trunk/include/linux/proc_fs.h @@ -28,11 +28,7 @@ struct mm_struct; */ enum { - PROC_ROOT_INO = 1, - PROC_IPC_INIT_INO = 0xEFFFFFFFU, - PROC_UTS_INIT_INO = 0xEFFFFFFEU, - PROC_USER_INIT_INO = 0xEFFFFFFDU, - PROC_PID_INIT_INO = 0xEFFFFFFCU, + PROC_ROOT_INO = 1, }; /* @@ -178,10 +174,7 @@ extern struct proc_dir_entry *proc_net_mkdir(struct net *net, const char *name, struct proc_dir_entry *parent); extern struct file *proc_ns_fget(int fd); -extern bool proc_ns_inode(struct inode *inode); -extern int proc_alloc_inum(unsigned int *pino); -extern void proc_free_inum(unsigned int inum); #else #define proc_net_fops_create(net, name, mode, fops) ({ (void)(mode), NULL; }) @@ -236,19 +229,6 @@ static inline struct file *proc_ns_fget(int fd) return ERR_PTR(-EINVAL); } -static inline bool proc_ns_inode(struct inode *inode) -{ - return false; -} - -static inline int proc_alloc_inum(unsigned int *inum) -{ - *inum = 1; - return 0; -} -static inline void proc_free_inum(unsigned int inum) -{ -} #endif /* CONFIG_PROC_FS */ #if !defined(CONFIG_PROC_KCORE) @@ -267,14 +247,10 @@ struct proc_ns_operations { void *(*get)(struct task_struct *task); void (*put)(void *ns); int (*install)(struct nsproxy *nsproxy, void *ns); - unsigned int (*inum)(void *ns); }; extern const struct proc_ns_operations netns_operations; extern const struct proc_ns_operations utsns_operations; extern const struct proc_ns_operations ipcns_operations; -extern const struct proc_ns_operations pidns_operations; -extern const struct proc_ns_operations userns_operations; -extern const struct proc_ns_operations mntns_operations; union proc_op { int (*proc_get_link)(struct dentry *, struct path *); diff --git a/trunk/include/linux/user_namespace.h b/trunk/include/linux/user_namespace.h index b9bd2e6c73cc..95142cae446a 100644 --- a/trunk/include/linux/user_namespace.h +++ b/trunk/include/linux/user_namespace.h @@ -25,7 +25,6 @@ struct user_namespace { struct user_namespace *parent; kuid_t owner; kgid_t group; - unsigned int proc_inum; }; extern struct user_namespace init_user_ns; @@ -40,7 +39,6 @@ static inline struct user_namespace *get_user_ns(struct user_namespace *ns) } extern int create_user_ns(struct cred *new); -extern int unshare_userns(unsigned long unshare_flags, struct cred **new_cred); extern void free_user_ns(struct kref *kref); static inline void put_user_ns(struct user_namespace *ns) @@ -68,14 +66,6 @@ static inline int create_user_ns(struct cred *new) return -EINVAL; } -static inline int unshare_userns(unsigned long unshare_flags, - struct cred **new_cred) -{ - if (unshare_flags & CLONE_NEWUSER) - return -EINVAL; - return 0; -} - static inline void put_user_ns(struct user_namespace *ns) { } diff --git a/trunk/include/linux/utsname.h b/trunk/include/linux/utsname.h index 239e27733d6c..2b345206722a 100644 --- a/trunk/include/linux/utsname.h +++ b/trunk/include/linux/utsname.h @@ -23,7 +23,6 @@ struct uts_namespace { struct kref kref; struct new_utsname name; struct user_namespace *user_ns; - unsigned int proc_inum; }; extern struct uts_namespace init_uts_ns; @@ -34,7 +33,7 @@ static inline void get_uts_ns(struct uts_namespace *ns) } extern struct uts_namespace *copy_utsname(unsigned long flags, - struct user_namespace *user_ns, struct uts_namespace *old_ns); + struct task_struct *tsk); extern void free_uts_ns(struct kref *kref); static inline void put_uts_ns(struct uts_namespace *ns) @@ -51,12 +50,12 @@ static inline void put_uts_ns(struct uts_namespace *ns) } 
static inline struct uts_namespace *copy_utsname(unsigned long flags, - struct user_namespace *user_ns, struct uts_namespace *old_ns) + struct task_struct *tsk) { if (flags & CLONE_NEWUTS) return ERR_PTR(-EINVAL); - return old_ns; + return tsk->nsproxy->uts_ns; } #endif diff --git a/trunk/include/linux/wait.h b/trunk/include/linux/wait.h index 7cb64d4b499d..168dfe122dd3 100644 --- a/trunk/include/linux/wait.h +++ b/trunk/include/linux/wait.h @@ -550,170 +550,6 @@ do { \ __ret; \ }) - -#define __wait_event_lock_irq(wq, condition, lock, cmd) \ -do { \ - DEFINE_WAIT(__wait); \ - \ - for (;;) { \ - prepare_to_wait(&wq, &__wait, TASK_UNINTERRUPTIBLE); \ - if (condition) \ - break; \ - spin_unlock_irq(&lock); \ - cmd; \ - schedule(); \ - spin_lock_irq(&lock); \ - } \ - finish_wait(&wq, &__wait); \ -} while (0) - -/** - * wait_event_lock_irq_cmd - sleep until a condition gets true. The - * condition is checked under the lock. This - * is expected to be called with the lock - * taken. - * @wq: the waitqueue to wait on - * @condition: a C expression for the event to wait for - * @lock: a locked spinlock_t, which will be released before cmd - * and schedule() and reacquired afterwards. - * @cmd: a command which is invoked outside the critical section before - * sleep - * - * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the - * @condition evaluates to true. The @condition is checked each time - * the waitqueue @wq is woken up. - * - * wake_up() has to be called after changing any variable that could - * change the result of the wait condition. - * - * This is supposed to be called while holding the lock. The lock is - * dropped before invoking the cmd and going to sleep and is reacquired - * afterwards. - */ -#define wait_event_lock_irq_cmd(wq, condition, lock, cmd) \ -do { \ - if (condition) \ - break; \ - __wait_event_lock_irq(wq, condition, lock, cmd); \ -} while (0) - -/** - * wait_event_lock_irq - sleep until a condition gets true. The - * condition is checked under the lock. This - * is expected to be called with the lock - * taken. - * @wq: the waitqueue to wait on - * @condition: a C expression for the event to wait for - * @lock: a locked spinlock_t, which will be released before schedule() - * and reacquired afterwards. - * - * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the - * @condition evaluates to true. The @condition is checked each time - * the waitqueue @wq is woken up. - * - * wake_up() has to be called after changing any variable that could - * change the result of the wait condition. - * - * This is supposed to be called while holding the lock. The lock is - * dropped before going to sleep and is reacquired afterwards. - */ -#define wait_event_lock_irq(wq, condition, lock) \ -do { \ - if (condition) \ - break; \ - __wait_event_lock_irq(wq, condition, lock, ); \ -} while (0) - - -#define __wait_event_interruptible_lock_irq(wq, condition, \ - lock, ret, cmd) \ -do { \ - DEFINE_WAIT(__wait); \ - \ - for (;;) { \ - prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE); \ - if (condition) \ - break; \ - if (signal_pending(current)) { \ - ret = -ERESTARTSYS; \ - break; \ - } \ - spin_unlock_irq(&lock); \ - cmd; \ - schedule(); \ - spin_lock_irq(&lock); \ - } \ - finish_wait(&wq, &__wait); \ -} while (0) - -/** - * wait_event_interruptible_lock_irq_cmd - sleep until a condition gets true. - * The condition is checked under the lock. This is expected to - * be called with the lock taken. 
- * @wq: the waitqueue to wait on - * @condition: a C expression for the event to wait for - * @lock: a locked spinlock_t, which will be released before cmd and - * schedule() and reacquired afterwards. - * @cmd: a command which is invoked outside the critical section before - * sleep - * - * The process is put to sleep (TASK_INTERRUPTIBLE) until the - * @condition evaluates to true or a signal is received. The @condition is - * checked each time the waitqueue @wq is woken up. - * - * wake_up() has to be called after changing any variable that could - * change the result of the wait condition. - * - * This is supposed to be called while holding the lock. The lock is - * dropped before invoking the cmd and going to sleep and is reacquired - * afterwards. - * - * The macro will return -ERESTARTSYS if it was interrupted by a signal - * and 0 if @condition evaluated to true. - */ -#define wait_event_interruptible_lock_irq_cmd(wq, condition, lock, cmd) \ -({ \ - int __ret = 0; \ - \ - if (!(condition)) \ - __wait_event_interruptible_lock_irq(wq, condition, \ - lock, __ret, cmd); \ - __ret; \ -}) - -/** - * wait_event_interruptible_lock_irq - sleep until a condition gets true. - * The condition is checked under the lock. This is expected - * to be called with the lock taken. - * @wq: the waitqueue to wait on - * @condition: a C expression for the event to wait for - * @lock: a locked spinlock_t, which will be released before schedule() - * and reacquired afterwards. - * - * The process is put to sleep (TASK_INTERRUPTIBLE) until the - * @condition evaluates to true or signal is received. The @condition is - * checked each time the waitqueue @wq is woken up. - * - * wake_up() has to be called after changing any variable that could - * change the result of the wait condition. - * - * This is supposed to be called while holding the lock. The lock is - * dropped before going to sleep and is reacquired afterwards. - * - * The macro will return -ERESTARTSYS if it was interrupted by a signal - * and 0 if @condition evaluated to true. - */ -#define wait_event_interruptible_lock_irq(wq, condition, lock) \ -({ \ - int __ret = 0; \ - \ - if (!(condition)) \ - __wait_event_interruptible_lock_irq(wq, condition, \ - lock, __ret, ); \ - __ret; \ -}) - - /* * These are the old interfaces to sleep waiting for an event. * They are racy. DO NOT use them, use the wait_event* interfaces above. 
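The kernel-doc removed in the wait.h hunk above describes one consistent contract for the wait_event_lock_irq() family: the caller already holds the spinlock, the condition is re-checked under that lock, and the lock is dropped only around schedule() (and around @cmd in the _cmd variants). A minimal caller sketch of that contract follows; the structure and field names (my_dev, req_lock, req_list, req_wait) are hypothetical and not taken from this patch, and the macro itself is the one whose documentation appears in the hunk above.

        #include <linux/list.h>
        #include <linux/sched.h>
        #include <linux/spinlock.h>
        #include <linux/wait.h>

        struct my_dev {
                spinlock_t req_lock;
                struct list_head req_list;
                wait_queue_head_t req_wait;
        };

        static void wait_for_request(struct my_dev *dev)
        {
                spin_lock_irq(&dev->req_lock);
                /* Sleeps in TASK_UNINTERRUPTIBLE until req_list is non-empty.
                 * req_lock is released around schedule() and reacquired before
                 * the condition is re-checked, per the contract documented
                 * above. */
                wait_event_lock_irq(dev->req_wait,
                                    !list_empty(&dev->req_list),
                                    dev->req_lock);
                /* req_lock is held again here; dequeue work under the lock. */
                spin_unlock_irq(&dev->req_lock);
        }

A producer would add an entry to req_list under req_lock and then call wake_up(&dev->req_wait), matching the "wake_up() has to be called after changing any variable that could change the result of the wait condition" note in the removed documentation.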
diff --git a/trunk/include/net/net_namespace.h b/trunk/include/net/net_namespace.h index de644bcd8613..c5a43f56b796 100644 --- a/trunk/include/net/net_namespace.h +++ b/trunk/include/net/net_namespace.h @@ -56,8 +56,6 @@ struct net { struct user_namespace *user_ns; /* Owning user namespace */ - unsigned int proc_inum; - struct proc_dir_entry *proc_net; struct proc_dir_entry *proc_net_stat; diff --git a/trunk/init/Kconfig b/trunk/init/Kconfig index 675d8a2326cf..1a207efca591 100644 --- a/trunk/init/Kconfig +++ b/trunk/init/Kconfig @@ -1069,9 +1069,11 @@ config UIDGID_CONVERTED # Filesystems depends on 9P_FS = n depends on AFS_FS = n + depends on AUTOFS4_FS = n depends on CEPH_FS = n depends on CIFS = n depends on CODA_FS = n + depends on FUSE_FS = n depends on GFS2_FS = n depends on NCP_FS = n depends on NFSD = n diff --git a/trunk/init/do_mounts.c b/trunk/init/do_mounts.c index 1d1b6348f903..f8a66424360d 100644 --- a/trunk/init/do_mounts.c +++ b/trunk/init/do_mounts.c @@ -69,28 +69,23 @@ __setup("ro", readonly); __setup("rw", readwrite); #ifdef CONFIG_BLOCK -struct uuidcmp { - const char *uuid; - int len; -}; - /** * match_dev_by_uuid - callback for finding a partition using its uuid * @dev: device passed in by the caller - * @data: opaque pointer to the desired struct uuidcmp to match + * @data: opaque pointer to a 36 byte char array with a UUID * * Returns 1 if the device matches, and 0 otherwise. */ static int match_dev_by_uuid(struct device *dev, void *data) { - struct uuidcmp *cmp = data; + u8 *uuid = data; struct hd_struct *part = dev_to_part(dev); if (!part->info) goto no_match; - if (strncasecmp(cmp->uuid, part->info->uuid, cmp->len)) - goto no_match; + if (memcmp(uuid, part->info->uuid, sizeof(part->info->uuid))) + goto no_match; return 1; no_match: @@ -100,7 +95,7 @@ static int match_dev_by_uuid(struct device *dev, void *data) /** * devt_from_partuuid - looks up the dev_t of a partition by its UUID - * @uuid: char array containing ascii UUID + * @uuid: min 36 byte char array containing a hex ascii UUID * * The function will return the first partition which contains a matching * UUID value in its partition_meta_info struct. This does not search @@ -111,41 +106,38 @@ static int match_dev_by_uuid(struct device *dev, void *data) * * Returns the matching dev_t on success or 0 on failure. */ -static dev_t devt_from_partuuid(const char *uuid_str) +static dev_t devt_from_partuuid(char *uuid_str) { dev_t res = 0; - struct uuidcmp cmp; struct device *dev = NULL; + u8 uuid[16]; struct gendisk *disk; struct hd_struct *part; int offset = 0; - bool clear_root_wait = false; - char *slash; - cmp.uuid = uuid_str; + if (strlen(uuid_str) < 36) + goto done; - slash = strchr(uuid_str, '/'); /* Check for optional partition number offset attributes. */ - if (slash) { + if (uuid_str[36]) { char c = 0; /* Explicitly fail on poor PARTUUID syntax. */ - if (sscanf(slash + 1, - "PARTNROFF=%d%c", &offset, &c) != 1) { - clear_root_wait = true; + if (sscanf(&uuid_str[36], + "/PARTNROFF=%d%c", &offset, &c) != 1) { + printk(KERN_ERR "VFS: PARTUUID= is invalid.\n" + "Expected PARTUUID=[/PARTNROFF=%%d]\n"); + if (root_wait) + printk(KERN_ERR + "Disabling rootwait; root= is invalid.\n"); + root_wait = 0; goto done; } - cmp.len = slash - uuid_str; - } else { - cmp.len = strlen(uuid_str); } - if (!cmp.len) { - clear_root_wait = true; - goto done; - } + /* Pack the requested UUID in the expected format. 
*/ + part_pack_uuid(uuid_str, uuid); - dev = class_find_device(&block_class, NULL, &cmp, - &match_dev_by_uuid); + dev = class_find_device(&block_class, NULL, uuid, &match_dev_by_uuid); if (!dev) goto done; @@ -166,13 +158,6 @@ static dev_t devt_from_partuuid(const char *uuid_str) no_offset: put_device(dev); done: - if (clear_root_wait) { - pr_err("VFS: PARTUUID= is invalid.\n" - "Expected PARTUUID=[/PARTNROFF=%%d]\n"); - if (root_wait) - pr_err("Disabling rootwait; root= is invalid.\n"); - root_wait = 0; - } return res; } #endif @@ -189,10 +174,6 @@ static dev_t devt_from_partuuid(const char *uuid_str) * used when disk name of partitioned disk ends on a digit. * 6) PARTUUID=00112233-4455-6677-8899-AABBCCDDEEFF representing the * unique id of a partition if the partition table provides it. - * The UUID may be either an EFI/GPT UUID, or refer to an MSDOS - * partition using the format SSSSSSSS-PP, where SSSSSSSS is a zero- - * filled hex representation of the 32-bit "NT disk signature", and PP - * is a zero-filled hex representation of the 1-based partition number. * 7) PARTUUID=/PARTNROFF= to select a partition in relation to * a partition with a known unique id. * diff --git a/trunk/init/main.c b/trunk/init/main.c index baf1f0f5c461..63ae904a99a8 100644 --- a/trunk/init/main.c +++ b/trunk/init/main.c @@ -812,6 +812,7 @@ static int __ref kernel_init(void *unused) system_state = SYSTEM_RUNNING; numa_default_policy(); + current->signal->flags |= SIGNAL_UNKILLABLE; flush_delayed_fput(); if (ramdisk_execute_command) { diff --git a/trunk/init/version.c b/trunk/init/version.c index 58170f18912d..86fe0ccb997a 100644 --- a/trunk/init/version.c +++ b/trunk/init/version.c @@ -12,7 +12,6 @@ #include #include #include -#include #ifndef CONFIG_KALLSYMS #define version(a) Version_ ## a @@ -35,7 +34,6 @@ struct uts_namespace init_uts_ns = { .domainname = UTS_DOMAINNAME, }, .user_ns = &init_user_ns, - .proc_inum = PROC_UTS_INIT_INO, }; EXPORT_SYMBOL_GPL(init_uts_ns); diff --git a/trunk/ipc/msgutil.c b/trunk/ipc/msgutil.c index 6471f1bdae96..26143d377c95 100644 --- a/trunk/ipc/msgutil.c +++ b/trunk/ipc/msgutil.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include "util.h" @@ -31,7 +30,6 @@ DEFINE_SPINLOCK(mq_lock); struct ipc_namespace init_ipc_ns = { .count = ATOMIC_INIT(1), .user_ns = &init_user_ns, - .proc_inum = PROC_IPC_INIT_INO, }; atomic_t nr_ipc_ns = ATOMIC_INIT(1); diff --git a/trunk/ipc/namespace.c b/trunk/ipc/namespace.c index cf3386a51de2..f362298c5ce4 100644 --- a/trunk/ipc/namespace.c +++ b/trunk/ipc/namespace.c @@ -16,7 +16,7 @@ #include "util.h" -static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, +static struct ipc_namespace *create_ipc_ns(struct task_struct *tsk, struct ipc_namespace *old_ns) { struct ipc_namespace *ns; @@ -26,16 +26,9 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, if (ns == NULL) return ERR_PTR(-ENOMEM); - err = proc_alloc_inum(&ns->proc_inum); - if (err) { - kfree(ns); - return ERR_PTR(err); - } - atomic_set(&ns->count, 1); err = mq_init_ns(ns); if (err) { - proc_free_inum(ns->proc_inum); kfree(ns); return ERR_PTR(err); } @@ -53,17 +46,19 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, ipcns_notify(IPCNS_CREATED); register_ipcns_notifier(ns); - ns->user_ns = get_user_ns(user_ns); + ns->user_ns = get_user_ns(task_cred_xxx(tsk, user_ns)); return ns; } struct ipc_namespace *copy_ipcs(unsigned long flags, - struct user_namespace *user_ns, struct ipc_namespace *ns) + 
struct task_struct *tsk) { + struct ipc_namespace *ns = tsk->nsproxy->ipc_ns; + if (!(flags & CLONE_NEWIPC)) return get_ipc_ns(ns); - return create_ipc_ns(user_ns, ns); + return create_ipc_ns(tsk, ns); } /* @@ -118,7 +113,6 @@ static void free_ipc_ns(struct ipc_namespace *ns) */ ipcns_notify(IPCNS_REMOVED); put_user_ns(ns->user_ns); - proc_free_inum(ns->proc_inum); kfree(ns); } @@ -167,12 +161,8 @@ static void ipcns_put(void *ns) return put_ipc_ns(ns); } -static int ipcns_install(struct nsproxy *nsproxy, void *new) +static int ipcns_install(struct nsproxy *nsproxy, void *ns) { - struct ipc_namespace *ns = new; - if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) - return -EPERM; - /* Ditch state from the old ipc namespace */ exit_sem(current); put_ipc_ns(nsproxy->ipc_ns); @@ -180,18 +170,10 @@ static int ipcns_install(struct nsproxy *nsproxy, void *new) return 0; } -static unsigned int ipcns_inum(void *vp) -{ - struct ipc_namespace *ns = vp; - - return ns->proc_inum; -} - const struct proc_ns_operations ipcns_operations = { .name = "ipc", .type = CLONE_NEWIPC, .get = ipcns_get, .put = ipcns_put, .install = ipcns_install, - .inum = ipcns_inum, }; diff --git a/trunk/kernel/cgroup.c b/trunk/kernel/cgroup.c index 9915ffe01372..f34c41bfaa37 100644 --- a/trunk/kernel/cgroup.c +++ b/trunk/kernel/cgroup.c @@ -3409,7 +3409,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, { struct cgroup_pidlist *l; /* don't need task_nsproxy() if we're looking at ourself */ - struct pid_namespace *ns = task_active_pid_ns(current); + struct pid_namespace *ns = current->nsproxy->pid_ns; /* * We can't drop the pidlist_mutex before taking the l->mutex in case diff --git a/trunk/kernel/events/core.c b/trunk/kernel/events/core.c index 301079d06f24..f9ff5493171d 100644 --- a/trunk/kernel/events/core.c +++ b/trunk/kernel/events/core.c @@ -6155,7 +6155,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, event->parent = parent_event; - event->ns = get_pid_ns(task_active_pid_ns(current)); + event->ns = get_pid_ns(current->nsproxy->pid_ns); event->id = atomic64_inc_return(&perf_event_id); event->state = PERF_EVENT_STATE_INACTIVE; diff --git a/trunk/kernel/exit.c b/trunk/kernel/exit.c index b4df21937216..50d2e93c36ea 100644 --- a/trunk/kernel/exit.c +++ b/trunk/kernel/exit.c @@ -72,6 +72,18 @@ static void __unhash_process(struct task_struct *p, bool group_dead) list_del_rcu(&p->tasks); list_del_init(&p->sibling); __this_cpu_dec(process_counts); + /* + * If we are the last child process in a pid namespace to be + * reaped, notify the reaper sleeping zap_pid_ns_processes(). 
+ */ + if (IS_ENABLED(CONFIG_PID_NS)) { + struct task_struct *parent = p->real_parent; + + if ((task_active_pid_ns(parent)->child_reaper == parent) && + list_empty(&parent->children) && + (parent->flags & PF_EXITING)) + wake_up_process(parent); + } } list_del_rcu(&p->thread_group); } diff --git a/trunk/kernel/fork.c b/trunk/kernel/fork.c index c36c4e301efe..115d6c2e4cca 100644 --- a/trunk/kernel/fork.c +++ b/trunk/kernel/fork.c @@ -1044,6 +1044,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) atomic_set(&sig->live, 1); atomic_set(&sig->sigcnt, 1); init_waitqueue_head(&sig->wait_chldexit); + if (clone_flags & CLONE_NEWPID) + sig->flags |= SIGNAL_UNKILLABLE; sig->curr_target = tsk; init_sigpending(&sig->shared_pending); INIT_LIST_HEAD(&sig->posix_timers); @@ -1436,10 +1438,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); if (thread_group_leader(p)) { - if (is_child_reaper(pid)) { - ns_of_pid(pid)->child_reaper = p; - p->signal->flags |= SIGNAL_UNKILLABLE; - } + if (is_child_reaper(pid)) + p->nsproxy->pid_ns->child_reaper = p; p->signal->leader_pid = pid; p->signal->tty = tty_kref_get(current->signal->tty); @@ -1473,6 +1473,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (p->io_context) exit_io_context(p); bad_fork_cleanup_namespaces: + if (unlikely(clone_flags & CLONE_NEWPID)) + pid_ns_release_proc(p->nsproxy->pid_ns); exit_task_namespaces(p); bad_fork_cleanup_mm: if (p->mm) @@ -1552,9 +1554,15 @@ long do_fork(unsigned long clone_flags, * Do some preliminary argument and permissions checking before we * actually start allocating stuff */ - if (clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) { - if (clone_flags & (CLONE_THREAD|CLONE_PARENT)) + if (clone_flags & CLONE_NEWUSER) { + if (clone_flags & CLONE_THREAD) return -EINVAL; + /* hopefully this check will go away when userns support is + * complete + */ + if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SETUID) || + !capable(CAP_SETGID)) + return -EPERM; } /* @@ -1716,8 +1724,7 @@ static int check_unshare_flags(unsigned long unshare_flags) { if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| - CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET| - CLONE_NEWUSER|CLONE_NEWPID)) + CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET)) return -EINVAL; /* * Not implemented, but pretend it works if there is nothing to @@ -1784,40 +1791,19 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) { struct fs_struct *fs, *new_fs = NULL; struct files_struct *fd, *new_fd = NULL; - struct cred *new_cred = NULL; struct nsproxy *new_nsproxy = NULL; int do_sysvsem = 0; int err; - /* - * If unsharing a user namespace must also unshare the thread. - */ - if (unshare_flags & CLONE_NEWUSER) - unshare_flags |= CLONE_THREAD; - /* - * If unsharing a pid namespace must also unshare the thread. - */ - if (unshare_flags & CLONE_NEWPID) - unshare_flags |= CLONE_THREAD; - /* - * If unsharing a thread from a thread group, must also unshare vm. - */ - if (unshare_flags & CLONE_THREAD) - unshare_flags |= CLONE_VM; - /* - * If unsharing vm, must also unshare signal handlers. - */ - if (unshare_flags & CLONE_VM) - unshare_flags |= CLONE_SIGHAND; + err = check_unshare_flags(unshare_flags); + if (err) + goto bad_unshare_out; + /* * If unsharing namespace, must also unshare filesystem information. 
*/ if (unshare_flags & CLONE_NEWNS) unshare_flags |= CLONE_FS; - - err = check_unshare_flags(unshare_flags); - if (err) - goto bad_unshare_out; /* * CLONE_NEWIPC must also detach from the undolist: after switching * to a new ipc namespace, the semaphore arrays from the old @@ -1831,15 +1817,11 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) err = unshare_fd(unshare_flags, &new_fd); if (err) goto bad_unshare_cleanup_fs; - err = unshare_userns(unshare_flags, &new_cred); + err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_fs); if (err) goto bad_unshare_cleanup_fd; - err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, - new_cred, new_fs); - if (err) - goto bad_unshare_cleanup_cred; - if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) { + if (new_fs || new_fd || do_sysvsem || new_nsproxy) { if (do_sysvsem) { /* * CLONE_SYSVSEM is equivalent to sys_exit(). @@ -1872,20 +1854,11 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) } task_unlock(current); - - if (new_cred) { - /* Install the new user namespace */ - commit_creds(new_cred); - new_cred = NULL; - } } if (new_nsproxy) put_nsproxy(new_nsproxy); -bad_unshare_cleanup_cred: - if (new_cred) - put_cred(new_cred); bad_unshare_cleanup_fd: if (new_fd) put_files_struct(new_fd); diff --git a/trunk/kernel/nsproxy.c b/trunk/kernel/nsproxy.c index 78e2ecb20165..7e1c3de1ce45 100644 --- a/trunk/kernel/nsproxy.c +++ b/trunk/kernel/nsproxy.c @@ -57,8 +57,7 @@ static inline struct nsproxy *create_nsproxy(void) * leave it to the caller to do proper locking and attach it to task. */ static struct nsproxy *create_new_namespaces(unsigned long flags, - struct task_struct *tsk, struct user_namespace *user_ns, - struct fs_struct *new_fs) + struct task_struct *tsk, struct fs_struct *new_fs) { struct nsproxy *new_nsp; int err; @@ -67,31 +66,31 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, if (!new_nsp) return ERR_PTR(-ENOMEM); - new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, user_ns, new_fs); + new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, new_fs); if (IS_ERR(new_nsp->mnt_ns)) { err = PTR_ERR(new_nsp->mnt_ns); goto out_ns; } - new_nsp->uts_ns = copy_utsname(flags, user_ns, tsk->nsproxy->uts_ns); + new_nsp->uts_ns = copy_utsname(flags, tsk); if (IS_ERR(new_nsp->uts_ns)) { err = PTR_ERR(new_nsp->uts_ns); goto out_uts; } - new_nsp->ipc_ns = copy_ipcs(flags, user_ns, tsk->nsproxy->ipc_ns); + new_nsp->ipc_ns = copy_ipcs(flags, tsk); if (IS_ERR(new_nsp->ipc_ns)) { err = PTR_ERR(new_nsp->ipc_ns); goto out_ipc; } - new_nsp->pid_ns = copy_pid_ns(flags, user_ns, tsk->nsproxy->pid_ns); + new_nsp->pid_ns = copy_pid_ns(flags, task_active_pid_ns(tsk)); if (IS_ERR(new_nsp->pid_ns)) { err = PTR_ERR(new_nsp->pid_ns); goto out_pid; } - new_nsp->net_ns = copy_net_ns(flags, user_ns, tsk->nsproxy->net_ns); + new_nsp->net_ns = copy_net_ns(flags, task_cred_xxx(tsk, user_ns), tsk->nsproxy->net_ns); if (IS_ERR(new_nsp->net_ns)) { err = PTR_ERR(new_nsp->net_ns); goto out_net; @@ -123,7 +122,6 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, int copy_namespaces(unsigned long flags, struct task_struct *tsk) { struct nsproxy *old_ns = tsk->nsproxy; - struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns); struct nsproxy *new_ns; int err = 0; @@ -136,7 +134,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) CLONE_NEWPID | CLONE_NEWNET))) return 0; - if (!ns_capable(user_ns, CAP_SYS_ADMIN)) { + if (!capable(CAP_SYS_ADMIN)) { err = -EPERM; goto out; } 
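With the sys_unshare() and copy_namespaces() hunks above (and the matching check in unshare_nsproxy_namespaces() that follows), namespace creation goes back to a plain capable(CAP_SYS_ADMIN) test, so an unprivileged caller gets -EPERM again. A small userspace sketch of that restored behaviour via unshare(2); this is illustrative only and not derived from this patch:

        #define _GNU_SOURCE
        #include <sched.h>
        #include <stdio.h>
        #include <string.h>
        #include <unistd.h>

        int main(void)
        {
                /* With the capable(CAP_SYS_ADMIN) check restored above, this
                 * fails with EPERM for an unprivileged caller. */
                if (unshare(CLONE_NEWUTS) == -1) {
                        perror("unshare(CLONE_NEWUTS)");
                        return 1;
                }

                /* The new UTS namespace starts as a copy of the parent's;
                 * hostname changes are now private to this namespace. */
                if (sethostname("sandbox", strlen("sandbox")) == -1)
                        perror("sethostname");

                char name[65];
                if (gethostname(name, sizeof(name)) == 0)
                        printf("hostname in new uts ns: %s\n", name);
                return 0;
        }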
@@ -153,8 +151,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) goto out; } - new_ns = create_new_namespaces(flags, tsk, - task_cred_xxx(tsk, user_ns), tsk->fs); + new_ns = create_new_namespaces(flags, tsk, tsk->fs); if (IS_ERR(new_ns)) { err = PTR_ERR(new_ns); goto out; @@ -186,21 +183,19 @@ void free_nsproxy(struct nsproxy *ns) * On success, returns the new nsproxy. */ int unshare_nsproxy_namespaces(unsigned long unshare_flags, - struct nsproxy **new_nsp, struct cred *new_cred, struct fs_struct *new_fs) + struct nsproxy **new_nsp, struct fs_struct *new_fs) { - struct user_namespace *user_ns; int err = 0; if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | - CLONE_NEWNET | CLONE_NEWPID))) + CLONE_NEWNET))) return 0; - user_ns = new_cred ? new_cred->user_ns : current_user_ns(); - if (!ns_capable(user_ns, CAP_SYS_ADMIN)) + if (!capable(CAP_SYS_ADMIN)) return -EPERM; - *new_nsp = create_new_namespaces(unshare_flags, current, user_ns, - new_fs ? new_fs : current->fs); + *new_nsp = create_new_namespaces(unshare_flags, current, + new_fs ? new_fs : current->fs); if (IS_ERR(*new_nsp)) { err = PTR_ERR(*new_nsp); goto out; @@ -246,6 +241,9 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype) struct file *file; int err; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + file = proc_ns_fget(fd); if (IS_ERR(file)) return PTR_ERR(file); @@ -256,7 +254,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype) if (nstype && (ops->type != nstype)) goto out; - new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs); + new_nsproxy = create_new_namespaces(0, tsk, tsk->fs); if (IS_ERR(new_nsproxy)) { err = PTR_ERR(new_nsproxy); goto out; diff --git a/trunk/kernel/pid.c b/trunk/kernel/pid.c index 3e2cf8100acc..fd996c1ed9f8 100644 --- a/trunk/kernel/pid.c +++ b/trunk/kernel/pid.c @@ -36,7 +36,6 @@ #include #include #include -#include #define pid_hashfn(nr, ns) \ hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) @@ -79,8 +78,6 @@ struct pid_namespace init_pid_ns = { .last_pid = 0, .level = 0, .child_reaper = &init_task, - .user_ns = &init_user_ns, - .proc_inum = PROC_PID_INIT_INO, }; EXPORT_SYMBOL_GPL(init_pid_ns); @@ -272,24 +269,8 @@ void free_pid(struct pid *pid) unsigned long flags; spin_lock_irqsave(&pidmap_lock, flags); - for (i = 0; i <= pid->level; i++) { - struct upid *upid = pid->numbers + i; - struct pid_namespace *ns = upid->ns; - hlist_del_rcu(&upid->pid_chain); - switch(--ns->nr_hashed) { - case 1: - /* When all that is left in the pid namespace - * is the reaper wake up the reaper. The reaper - * may be sleeping in zap_pid_ns_processes(). 
- */ - wake_up_process(ns->child_reaper); - break; - case 0: - ns->nr_hashed = -1; - schedule_work(&ns->proc_work); - break; - } - } + for (i = 0; i <= pid->level; i++) + hlist_del_rcu(&pid->numbers[i].pid_chain); spin_unlock_irqrestore(&pidmap_lock, flags); for (i = 0; i <= pid->level; i++) @@ -311,7 +292,6 @@ struct pid *alloc_pid(struct pid_namespace *ns) goto out; tmp = ns; - pid->level = ns->level; for (i = ns->level; i >= 0; i--) { nr = alloc_pidmap(tmp); if (nr < 0) @@ -322,32 +302,22 @@ struct pid *alloc_pid(struct pid_namespace *ns) tmp = tmp->parent; } - if (unlikely(is_child_reaper(pid))) { - if (pid_ns_prepare_proc(ns)) - goto out_free; - } - get_pid_ns(ns); + pid->level = ns->level; atomic_set(&pid->count, 1); for (type = 0; type < PIDTYPE_MAX; ++type) INIT_HLIST_HEAD(&pid->tasks[type]); upid = pid->numbers + ns->level; spin_lock_irq(&pidmap_lock); - if (ns->nr_hashed < 0) - goto out_unlock; - for ( ; upid >= pid->numbers; --upid) { + for ( ; upid >= pid->numbers; --upid) hlist_add_head_rcu(&upid->pid_chain, &pid_hash[pid_hashfn(upid->nr, upid->ns)]); - upid->ns->nr_hashed++; - } spin_unlock_irq(&pidmap_lock); out: return pid; -out_unlock: - spin_unlock(&pidmap_lock); out_free: while (++i <= ns->level) free_pidmap(pid->numbers + i); @@ -374,7 +344,7 @@ EXPORT_SYMBOL_GPL(find_pid_ns); struct pid *find_vpid(int nr) { - return find_pid_ns(nr, task_active_pid_ns(current)); + return find_pid_ns(nr, current->nsproxy->pid_ns); } EXPORT_SYMBOL_GPL(find_vpid); @@ -458,7 +428,7 @@ struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) struct task_struct *find_task_by_vpid(pid_t vnr) { - return find_task_by_pid_ns(vnr, task_active_pid_ns(current)); + return find_task_by_pid_ns(vnr, current->nsproxy->pid_ns); } struct pid *get_task_pid(struct task_struct *task, enum pid_type type) @@ -513,7 +483,7 @@ EXPORT_SYMBOL_GPL(pid_nr_ns); pid_t pid_vnr(struct pid *pid) { - return pid_nr_ns(pid, task_active_pid_ns(current)); + return pid_nr_ns(pid, current->nsproxy->pid_ns); } EXPORT_SYMBOL_GPL(pid_vnr); @@ -524,7 +494,7 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, rcu_read_lock(); if (!ns) - ns = task_active_pid_ns(current); + ns = current->nsproxy->pid_ns; if (likely(pid_alive(task))) { if (type != PIDTYPE_PID) task = task->group_leader; @@ -599,7 +569,6 @@ void __init pidmap_init(void) /* Reserve PID 0. 
We never call free_pidmap(0) */ set_bit(0, init_pid_ns.pidmap[0].page); atomic_dec(&init_pid_ns.pidmap[0].nr_free); - init_pid_ns.nr_hashed = 1; init_pid_ns.pid_cachep = KMEM_CACHE(pid, SLAB_HWCACHE_ALIGN | SLAB_PANIC); diff --git a/trunk/kernel/pid_namespace.c b/trunk/kernel/pid_namespace.c index 560da0dab230..7b07cc0dfb75 100644 --- a/trunk/kernel/pid_namespace.c +++ b/trunk/kernel/pid_namespace.c @@ -10,7 +10,6 @@ #include #include -#include #include #include #include @@ -72,17 +71,10 @@ static struct kmem_cache *create_pid_cachep(int nr_ids) return NULL; } -static void proc_cleanup_work(struct work_struct *work) -{ - struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_work); - pid_ns_release_proc(ns); -} - /* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ #define MAX_PID_NS_LEVEL 32 -static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns, - struct pid_namespace *parent_pid_ns) +static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_pid_ns) { struct pid_namespace *ns; unsigned int level = parent_pid_ns->level + 1; @@ -107,15 +99,9 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns if (ns->pid_cachep == NULL) goto out_free_map; - err = proc_alloc_inum(&ns->proc_inum); - if (err) - goto out_free_map; - kref_init(&ns->kref); ns->level = level; ns->parent = get_pid_ns(parent_pid_ns); - ns->user_ns = get_user_ns(user_ns); - INIT_WORK(&ns->proc_work, proc_cleanup_work); set_bit(0, ns->pidmap[0].page); atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); @@ -123,8 +109,14 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns for (i = 1; i < PIDMAP_ENTRIES; i++) atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); + err = pid_ns_prepare_proc(ns); + if (err) + goto out_put_parent_pid_ns; + return ns; +out_put_parent_pid_ns: + put_pid_ns(parent_pid_ns); out_free_map: kfree(ns->pidmap[0].page); out_free: @@ -137,21 +129,18 @@ static void destroy_pid_namespace(struct pid_namespace *ns) { int i; - proc_free_inum(ns->proc_inum); for (i = 0; i < PIDMAP_ENTRIES; i++) kfree(ns->pidmap[i].page); - put_user_ns(ns->user_ns); kmem_cache_free(pid_ns_cachep, ns); } -struct pid_namespace *copy_pid_ns(unsigned long flags, - struct user_namespace *user_ns, struct pid_namespace *old_ns) +struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns) { if (!(flags & CLONE_NEWPID)) return get_pid_ns(old_ns); - if (task_active_pid_ns(current) != old_ns) + if (flags & (CLONE_THREAD|CLONE_PARENT)) return ERR_PTR(-EINVAL); - return create_pid_namespace(user_ns, old_ns); + return create_pid_namespace(old_ns); } static void free_pid_ns(struct kref *kref) @@ -222,15 +211,22 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) /* * sys_wait4() above can't reap the TASK_DEAD children. - * Make sure they all go away, see free_pid(). + * Make sure they all go away, see __unhash_process(). 
*/ for (;;) { - set_current_state(TASK_UNINTERRUPTIBLE); - if (pid_ns->nr_hashed == 1) + bool need_wait = false; + + read_lock(&tasklist_lock); + if (!list_empty(¤t->children)) { + __set_current_state(TASK_UNINTERRUPTIBLE); + need_wait = true; + } + read_unlock(&tasklist_lock); + + if (!need_wait) break; schedule(); } - __set_current_state(TASK_RUNNING); if (pid_ns->reboot) current->signal->group_exit_code = pid_ns->reboot; @@ -243,10 +239,9 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) static int pid_ns_ctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - struct pid_namespace *pid_ns = task_active_pid_ns(current); struct ctl_table tmp = *table; - if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN)) + if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; /* @@ -255,7 +250,7 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write, * it should synchronize its usage with external means. */ - tmp.data = &pid_ns->last_pid; + tmp.data = ¤t->nsproxy->pid_ns->last_pid; return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); } @@ -304,67 +299,6 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) return 0; } -static void *pidns_get(struct task_struct *task) -{ - struct pid_namespace *ns; - - rcu_read_lock(); - ns = get_pid_ns(task_active_pid_ns(task)); - rcu_read_unlock(); - - return ns; -} - -static void pidns_put(void *ns) -{ - put_pid_ns(ns); -} - -static int pidns_install(struct nsproxy *nsproxy, void *ns) -{ - struct pid_namespace *active = task_active_pid_ns(current); - struct pid_namespace *ancestor, *new = ns; - - if (!ns_capable(new->user_ns, CAP_SYS_ADMIN)) - return -EPERM; - - /* - * Only allow entering the current active pid namespace - * or a child of the current active pid namespace. - * - * This is required for fork to return a usable pid value and - * this maintains the property that processes and their - * children can not escape their current pid namespace. 
- */ - if (new->level < active->level) - return -EINVAL; - - ancestor = new; - while (ancestor->level > active->level) - ancestor = ancestor->parent; - if (ancestor != active) - return -EINVAL; - - put_pid_ns(nsproxy->pid_ns); - nsproxy->pid_ns = get_pid_ns(new); - return 0; -} - -static unsigned int pidns_inum(void *ns) -{ - struct pid_namespace *pid_ns = ns; - return pid_ns->proc_inum; -} - -const struct proc_ns_operations pidns_operations = { - .name = "pid", - .type = CLONE_NEWPID, - .get = pidns_get, - .put = pidns_put, - .install = pidns_install, - .inum = pidns_inum, -}; - static __init int pid_namespaces_init(void) { pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); diff --git a/trunk/kernel/ptrace.c b/trunk/kernel/ptrace.c index 7b09b88862cc..1f5e55dda955 100644 --- a/trunk/kernel/ptrace.c +++ b/trunk/kernel/ptrace.c @@ -215,12 +215,8 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode) smp_rmb(); if (task->mm) dumpable = get_dumpable(task->mm); - rcu_read_lock(); - if (!dumpable && !ptrace_has_cap(__task_cred(task)->user_ns, mode)) { - rcu_read_unlock(); + if (!dumpable && !ptrace_has_cap(task_user_ns(task), mode)) return -EPERM; - } - rcu_read_unlock(); return security_ptrace_access_check(task, mode); } @@ -284,10 +280,8 @@ static int ptrace_attach(struct task_struct *task, long request, if (seize) flags |= PT_SEIZED; - rcu_read_lock(); - if (ns_capable(__task_cred(task)->user_ns, CAP_SYS_PTRACE)) + if (ns_capable(task_user_ns(task), CAP_SYS_PTRACE)) flags |= PT_PTRACE_CAP; - rcu_read_unlock(); task->ptrace = flags; __ptrace_link(task, current); diff --git a/trunk/kernel/sched/core.c b/trunk/kernel/sched/core.c index 257002c13bb0..c1fb82104bfb 100644 --- a/trunk/kernel/sched/core.c +++ b/trunk/kernel/sched/core.c @@ -4097,14 +4097,8 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) goto out_free_cpus_allowed; } retval = -EPERM; - if (!check_same_owner(p)) { - rcu_read_lock(); - if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { - rcu_read_unlock(); - goto out_unlock; - } - rcu_read_unlock(); - } + if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE)) + goto out_unlock; retval = security_task_setscheduler(p); if (retval) diff --git a/trunk/kernel/signal.c b/trunk/kernel/signal.c index 580a91e63471..a49c7f36ceb3 100644 --- a/trunk/kernel/signal.c +++ b/trunk/kernel/signal.c @@ -1753,7 +1753,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, * see comment in do_notify_parent() about the following 4 lines */ rcu_read_lock(); - info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(parent)); + info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk)); rcu_read_unlock(); diff --git a/trunk/kernel/sysctl_binary.c b/trunk/kernel/sysctl_binary.c index 5a6384450501..65bdcf198d4e 100644 --- a/trunk/kernel/sysctl_binary.c +++ b/trunk/kernel/sysctl_binary.c @@ -1344,7 +1344,7 @@ static ssize_t binary_sysctl(const int *name, int nlen, goto out_putname; } - mnt = task_active_pid_ns(current)->proc_mnt; + mnt = current->nsproxy->pid_ns->proc_mnt; file = file_open_root(mnt->mnt_root, mnt, pathname, flags); result = PTR_ERR(file); if (IS_ERR(file)) diff --git a/trunk/kernel/user.c b/trunk/kernel/user.c index 33acb5e53a5f..750acffbe9ec 100644 --- a/trunk/kernel/user.c +++ b/trunk/kernel/user.c @@ -16,7 +16,6 @@ #include #include #include -#include /* * userns count is 1 for root user, 1 for init_uts_ns, @@ -52,7 +51,6 @@ struct 
user_namespace init_user_ns = { }, .owner = GLOBAL_ROOT_UID, .group = GLOBAL_ROOT_GID, - .proc_inum = PROC_USER_INIT_INO, }; EXPORT_SYMBOL_GPL(init_user_ns); diff --git a/trunk/kernel/user_namespace.c b/trunk/kernel/user_namespace.c index f5975ccf9348..456a6b9fba34 100644 --- a/trunk/kernel/user_namespace.c +++ b/trunk/kernel/user_namespace.c @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include @@ -27,24 +26,6 @@ static struct kmem_cache *user_ns_cachep __read_mostly; static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, struct uid_gid_map *map); -static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) -{ - /* Start with the same capabilities as init but useless for doing - * anything as the capabilities are bound to the new user namespace. - */ - cred->securebits = SECUREBITS_DEFAULT; - cred->cap_inheritable = CAP_EMPTY_SET; - cred->cap_permitted = CAP_FULL_SET; - cred->cap_effective = CAP_FULL_SET; - cred->cap_bset = CAP_FULL_SET; -#ifdef CONFIG_KEYS - key_put(cred->request_key_auth); - cred->request_key_auth = NULL; -#endif - /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ - cred->user_ns = user_ns; -} - /* * Create a new user namespace, deriving the creator from the user in the * passed credentials, and replacing that user with the new root user for the @@ -58,7 +39,6 @@ int create_user_ns(struct cred *new) struct user_namespace *ns, *parent_ns = new->user_ns; kuid_t owner = new->euid; kgid_t group = new->egid; - int ret; /* The creator needs a mapping in the parent user namespace * or else we won't be able to reasonably tell userspace who @@ -72,36 +52,30 @@ int create_user_ns(struct cred *new) if (!ns) return -ENOMEM; - ret = proc_alloc_inum(&ns->proc_inum); - if (ret) { - kmem_cache_free(user_ns_cachep, ns); - return ret; - } - kref_init(&ns->kref); - /* Leave the new->user_ns reference with the new user namespace. */ ns->parent = parent_ns; ns->owner = owner; ns->group = group; - set_cred_user_ns(new, ns); - - return 0; -} - -int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) -{ - struct cred *cred; - - if (!(unshare_flags & CLONE_NEWUSER)) - return 0; + /* Start with the same capabilities as init but useless for doing + * anything as the capabilities are bound to the new user namespace. + */ + new->securebits = SECUREBITS_DEFAULT; + new->cap_inheritable = CAP_EMPTY_SET; + new->cap_permitted = CAP_FULL_SET; + new->cap_effective = CAP_FULL_SET; + new->cap_bset = CAP_FULL_SET; +#ifdef CONFIG_KEYS + key_put(new->request_key_auth); + new->request_key_auth = NULL; +#endif + /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ - cred = prepare_creds(); - if (!cred) - return -ENOMEM; + /* Leave the new->user_ns reference with the new user namespace. */ + /* Leave the reference to our user_ns with the new cred. 
*/ + new->user_ns = ns; - *new_cred = cred; - return create_user_ns(cred); + return 0; } void free_user_ns(struct kref *kref) @@ -110,7 +84,6 @@ void free_user_ns(struct kref *kref) container_of(kref, struct user_namespace, kref); parent = ns->parent; - proc_free_inum(ns->proc_inum); kmem_cache_free(user_ns_cachep, ns); put_user_ns(parent); } @@ -399,7 +372,7 @@ static int uid_m_show(struct seq_file *seq, void *v) struct user_namespace *lower_ns; uid_t lower; - lower_ns = seq_user_ns(seq); + lower_ns = current_user_ns(); if ((lower_ns == ns) && lower_ns->parent) lower_ns = lower_ns->parent; @@ -420,7 +393,7 @@ static int gid_m_show(struct seq_file *seq, void *v) struct user_namespace *lower_ns; gid_t lower; - lower_ns = seq_user_ns(seq); + lower_ns = current_user_ns(); if ((lower_ns == ns) && lower_ns->parent) lower_ns = lower_ns->parent; @@ -696,14 +669,10 @@ ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t siz { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; - struct user_namespace *seq_ns = seq_user_ns(seq); if (!ns->parent) return -EPERM; - if ((seq_ns != ns) && (seq_ns != ns->parent)) - return -EPERM; - return map_write(file, buf, size, ppos, CAP_SETUID, &ns->uid_map, &ns->parent->uid_map); } @@ -712,14 +681,10 @@ ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t siz { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; - struct user_namespace *seq_ns = seq_user_ns(seq); if (!ns->parent) return -EPERM; - if ((seq_ns != ns) && (seq_ns != ns->parent)) - return -EPERM; - return map_write(file, buf, size, ppos, CAP_SETGID, &ns->gid_map, &ns->parent->gid_map); } @@ -744,21 +709,6 @@ ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, struct uid_gid_map *new_map) { - /* Allow mapping to your own filesystem ids */ - if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1)) { - u32 id = new_map->extent[0].lower_first; - if (cap_setid == CAP_SETUID) { - kuid_t uid = make_kuid(ns->parent, id); - if (uid_eq(uid, current_fsuid())) - return true; - } - else if (cap_setid == CAP_SETGID) { - kgid_t gid = make_kgid(ns->parent, id); - if (gid_eq(gid, current_fsgid())) - return true; - } - } - /* Allow anyone to set a mapping that doesn't require privilege */ if (!cap_valid(cap_setid)) return true; @@ -772,65 +722,6 @@ static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, return false; } -static void *userns_get(struct task_struct *task) -{ - struct user_namespace *user_ns; - - rcu_read_lock(); - user_ns = get_user_ns(__task_cred(task)->user_ns); - rcu_read_unlock(); - - return user_ns; -} - -static void userns_put(void *ns) -{ - put_user_ns(ns); -} - -static int userns_install(struct nsproxy *nsproxy, void *ns) -{ - struct user_namespace *user_ns = ns; - struct cred *cred; - - /* Don't allow gaining capabilities by reentering - * the same user namespace. 
- */ - if (user_ns == current_user_ns()) - return -EINVAL; - - /* Threaded many not enter a different user namespace */ - if (atomic_read(¤t->mm->mm_users) > 1) - return -EINVAL; - - if (!ns_capable(user_ns, CAP_SYS_ADMIN)) - return -EPERM; - - cred = prepare_creds(); - if (!cred) - return -ENOMEM; - - put_user_ns(cred->user_ns); - set_cred_user_ns(cred, get_user_ns(user_ns)); - - return commit_creds(cred); -} - -static unsigned int userns_inum(void *ns) -{ - struct user_namespace *user_ns = ns; - return user_ns->proc_inum; -} - -const struct proc_ns_operations userns_operations = { - .name = "user", - .type = CLONE_NEWUSER, - .get = userns_get, - .put = userns_put, - .install = userns_install, - .inum = userns_inum, -}; - static __init int user_namespaces_init(void) { user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC); diff --git a/trunk/kernel/utsname.c b/trunk/kernel/utsname.c index f6336d51d64c..679d97a5d3fd 100644 --- a/trunk/kernel/utsname.c +++ b/trunk/kernel/utsname.c @@ -32,25 +32,18 @@ static struct uts_namespace *create_uts_ns(void) * @old_ns: namespace to clone * Return NULL on error (failure to kmalloc), new ns otherwise */ -static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, +static struct uts_namespace *clone_uts_ns(struct task_struct *tsk, struct uts_namespace *old_ns) { struct uts_namespace *ns; - int err; ns = create_uts_ns(); if (!ns) return ERR_PTR(-ENOMEM); - err = proc_alloc_inum(&ns->proc_inum); - if (err) { - kfree(ns); - return ERR_PTR(err); - } - down_read(&uts_sem); memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); - ns->user_ns = get_user_ns(user_ns); + ns->user_ns = get_user_ns(task_cred_xxx(tsk, user_ns)); up_read(&uts_sem); return ns; } @@ -62,8 +55,9 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, * versa. */ struct uts_namespace *copy_utsname(unsigned long flags, - struct user_namespace *user_ns, struct uts_namespace *old_ns) + struct task_struct *tsk) { + struct uts_namespace *old_ns = tsk->nsproxy->uts_ns; struct uts_namespace *new_ns; BUG_ON(!old_ns); @@ -72,7 +66,7 @@ struct uts_namespace *copy_utsname(unsigned long flags, if (!(flags & CLONE_NEWUTS)) return old_ns; - new_ns = clone_uts_ns(user_ns, old_ns); + new_ns = clone_uts_ns(tsk, old_ns); put_uts_ns(old_ns); return new_ns; @@ -84,7 +78,6 @@ void free_uts_ns(struct kref *kref) ns = container_of(kref, struct uts_namespace, kref); put_user_ns(ns->user_ns); - proc_free_inum(ns->proc_inum); kfree(ns); } @@ -109,31 +102,19 @@ static void utsns_put(void *ns) put_uts_ns(ns); } -static int utsns_install(struct nsproxy *nsproxy, void *new) +static int utsns_install(struct nsproxy *nsproxy, void *ns) { - struct uts_namespace *ns = new; - - if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) - return -EPERM; - get_uts_ns(ns); put_uts_ns(nsproxy->uts_ns); nsproxy->uts_ns = ns; return 0; } -static unsigned int utsns_inum(void *vp) -{ - struct uts_namespace *ns = vp; - - return ns->proc_inum; -} - const struct proc_ns_operations utsns_operations = { .name = "uts", .type = CLONE_NEWUTS, .get = utsns_get, .put = utsns_put, .install = utsns_install, - .inum = utsns_inum, }; + diff --git a/trunk/lib/lru_cache.c b/trunk/lib/lru_cache.c index d71d89498943..a07e7268d7ed 100644 --- a/trunk/lib/lru_cache.c +++ b/trunk/lib/lru_cache.c @@ -44,8 +44,8 @@ MODULE_LICENSE("GPL"); } while (0) #define RETURN(x...) 
do { \ - clear_bit_unlock(__LC_PARANOIA, &lc->flags); \ - return x ; } while (0) + clear_bit(__LC_PARANOIA, &lc->flags); \ + smp_mb__after_clear_bit(); return x ; } while (0) /* BUG() if e is not one of the elements tracked by lc */ #define PARANOIA_LC_ELEMENT(lc, e) do { \ @@ -55,40 +55,9 @@ MODULE_LICENSE("GPL"); BUG_ON(i >= lc_->nr_elements); \ BUG_ON(lc_->lc_element[i] != e_); } while (0) - -/* We need to atomically - * - try to grab the lock (set LC_LOCKED) - * - only if there is no pending transaction - * (neither LC_DIRTY nor LC_STARVING is set) - * Because of PARANOIA_ENTRY() above abusing lc->flags as well, - * it is not sufficient to just say - * return 0 == cmpxchg(&lc->flags, 0, LC_LOCKED); - */ -int lc_try_lock(struct lru_cache *lc) -{ - unsigned long val; - do { - val = cmpxchg(&lc->flags, 0, LC_LOCKED); - } while (unlikely (val == LC_PARANOIA)); - /* Spin until no-one is inside a PARANOIA_ENTRY()/RETURN() section. */ - return 0 == val; -#if 0 - /* Alternative approach, spin in case someone enters or leaves a - * PARANOIA_ENTRY()/RETURN() section. */ - unsigned long old, new, val; - do { - old = lc->flags & LC_PARANOIA; - new = old | LC_LOCKED; - val = cmpxchg(&lc->flags, old, new); - } while (unlikely (val == (old ^ LC_PARANOIA))); - return old == val; -#endif -} - /** * lc_create - prepares to track objects in an active set * @name: descriptive name only used in lc_seq_printf_stats and lc_seq_dump_details - * @max_pending_changes: maximum changes to accumulate until a transaction is required * @e_count: number of elements allowed to be active simultaneously * @e_size: size of the tracked objects * @e_off: offset to the &struct lc_element member in a tracked object @@ -97,7 +66,6 @@ int lc_try_lock(struct lru_cache *lc) * or NULL on (allocation) failure. 
*/ struct lru_cache *lc_create(const char *name, struct kmem_cache *cache, - unsigned max_pending_changes, unsigned e_count, size_t e_size, size_t e_off) { struct hlist_head *slot = NULL; @@ -130,13 +98,12 @@ struct lru_cache *lc_create(const char *name, struct kmem_cache *cache, INIT_LIST_HEAD(&lc->in_use); INIT_LIST_HEAD(&lc->lru); INIT_LIST_HEAD(&lc->free); - INIT_LIST_HEAD(&lc->to_be_changed); lc->name = name; lc->element_size = e_size; lc->element_off = e_off; lc->nr_elements = e_count; - lc->max_pending_changes = max_pending_changes; + lc->new_number = LC_FREE; lc->lc_cache = cache; lc->lc_element = element; lc->lc_slot = slot; @@ -150,7 +117,6 @@ struct lru_cache *lc_create(const char *name, struct kmem_cache *cache, e = p + e_off; e->lc_index = i; e->lc_number = LC_FREE; - e->lc_new_number = LC_FREE; list_add(&e->list, &lc->free); element[i] = e; } @@ -209,15 +175,15 @@ void lc_reset(struct lru_cache *lc) INIT_LIST_HEAD(&lc->in_use); INIT_LIST_HEAD(&lc->lru); INIT_LIST_HEAD(&lc->free); - INIT_LIST_HEAD(&lc->to_be_changed); lc->used = 0; lc->hits = 0; lc->misses = 0; lc->starving = 0; - lc->locked = 0; + lc->dirty = 0; lc->changed = 0; - lc->pending_changes = 0; lc->flags = 0; + lc->changing_element = NULL; + lc->new_number = LC_FREE; memset(lc->lc_slot, 0, sizeof(struct hlist_head) * lc->nr_elements); for (i = 0; i < lc->nr_elements; i++) { @@ -228,7 +194,6 @@ void lc_reset(struct lru_cache *lc) /* re-init it */ e->lc_index = i; e->lc_number = LC_FREE; - e->lc_new_number = LC_FREE; list_add(&e->list, &lc->free); } } @@ -243,14 +208,14 @@ size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc) /* NOTE: * total calls to lc_get are * (starving + hits + misses) - * misses include "locked" count (update from an other thread in + * misses include "dirty" count (update from an other thread in * progress) and "changed", when this in fact lead to an successful * update of the cache. */ return seq_printf(seq, "\t%s: used:%u/%u " - "hits:%lu misses:%lu starving:%lu locked:%lu changed:%lu\n", + "hits:%lu misses:%lu starving:%lu dirty:%lu changed:%lu\n", lc->name, lc->used, lc->nr_elements, - lc->hits, lc->misses, lc->starving, lc->locked, lc->changed); + lc->hits, lc->misses, lc->starving, lc->dirty, lc->changed); } static struct hlist_head *lc_hash_slot(struct lru_cache *lc, unsigned int enr) @@ -259,27 +224,6 @@ static struct hlist_head *lc_hash_slot(struct lru_cache *lc, unsigned int enr) } -static struct lc_element *__lc_find(struct lru_cache *lc, unsigned int enr, - bool include_changing) -{ - struct hlist_node *n; - struct lc_element *e; - - BUG_ON(!lc); - BUG_ON(!lc->nr_elements); - hlist_for_each_entry(e, n, lc_hash_slot(lc, enr), colision) { - /* "about to be changed" elements, pending transaction commit, - * are hashed by their "new number". "Normal" elements have - * lc_number == lc_new_number. */ - if (e->lc_new_number != enr) - continue; - if (e->lc_new_number == e->lc_number || include_changing) - return e; - break; - } - return NULL; -} - /** * lc_find - find element by label, if present in the hash table * @lc: The lru_cache object @@ -288,28 +232,38 @@ static struct lc_element *__lc_find(struct lru_cache *lc, unsigned int enr, * Returns the pointer to an element, if the element with the requested * "label" or element number is present in the hash table, * or NULL if not found. Does not change the refcnt. - * Ignores elements that are "about to be used", i.e. not yet in the active - * set, but still pending transaction commit. 
*/ struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr) { - return __lc_find(lc, enr, 0); + struct hlist_node *n; + struct lc_element *e; + + BUG_ON(!lc); + BUG_ON(!lc->nr_elements); + hlist_for_each_entry(e, n, lc_hash_slot(lc, enr), colision) { + if (e->lc_number == enr) + return e; + } + return NULL; } -/** - * lc_is_used - find element by label - * @lc: The lru_cache object - * @enr: element number - * - * Returns true, if the element with the requested "label" or element number is - * present in the hash table, and is used (refcnt > 0). - * Also finds elements that are not _currently_ used but only "about to be - * used", i.e. on the "to_be_changed" list, pending transaction commit. - */ -bool lc_is_used(struct lru_cache *lc, unsigned int enr) +/* returned element will be "recycled" immediately */ +static struct lc_element *lc_evict(struct lru_cache *lc) { - struct lc_element *e = __lc_find(lc, enr, 1); - return e && e->refcnt; + struct list_head *n; + struct lc_element *e; + + if (list_empty(&lc->lru)) + return NULL; + + n = lc->lru.prev; + e = list_entry(n, struct lc_element, list); + + PARANOIA_LC_ELEMENT(lc, e); + + list_del(&e->list); + hlist_del(&e->colision); + return e; } /** @@ -326,34 +280,22 @@ void lc_del(struct lru_cache *lc, struct lc_element *e) PARANOIA_LC_ELEMENT(lc, e); BUG_ON(e->refcnt); - e->lc_number = e->lc_new_number = LC_FREE; + e->lc_number = LC_FREE; hlist_del_init(&e->colision); list_move(&e->list, &lc->free); RETURN(); } -static struct lc_element *lc_prepare_for_change(struct lru_cache *lc, unsigned new_number) +static struct lc_element *lc_get_unused_element(struct lru_cache *lc) { struct list_head *n; - struct lc_element *e; - - if (!list_empty(&lc->free)) - n = lc->free.next; - else if (!list_empty(&lc->lru)) - n = lc->lru.prev; - else - return NULL; - - e = list_entry(n, struct lc_element, list); - PARANOIA_LC_ELEMENT(lc, e); - e->lc_new_number = new_number; - if (!hlist_unhashed(&e->colision)) - __hlist_del(&e->colision); - hlist_add_head(&e->colision, lc_hash_slot(lc, new_number)); - list_move(&e->list, &lc->to_be_changed); + if (list_empty(&lc->free)) + return lc_evict(lc); - return e; + n = lc->free.next; + list_del(n); + return list_entry(n, struct lc_element, list); } static int lc_unused_element_available(struct lru_cache *lc) @@ -366,7 +308,45 @@ static int lc_unused_element_available(struct lru_cache *lc) return 0; } -static struct lc_element *__lc_get(struct lru_cache *lc, unsigned int enr, bool may_change) + +/** + * lc_get - get element by label, maybe change the active set + * @lc: the lru cache to operate on + * @enr: the label to look up + * + * Finds an element in the cache, increases its usage count, + * "touches" and returns it. + * + * In case the requested number is not present, it needs to be added to the + * cache. Therefore it is possible that an other element becomes evicted from + * the cache. In either case, the user is notified so he is able to e.g. keep + * a persistent log of the cache changes, and therefore the objects in use. + * + * Return values: + * NULL + * The cache was marked %LC_STARVING, + * or the requested label was not in the active set + * and a changing transaction is still pending (@lc was marked %LC_DIRTY). + * Or no unused or free element could be recycled (@lc will be marked as + * %LC_STARVING, blocking further lc_get() operations). + * + * pointer to the element with the REQUESTED element number. 
+ * In this case, it can be used right away + * + * pointer to an UNUSED element with some different element number, + * where that different number may also be %LC_FREE. + * + * In this case, the cache is marked %LC_DIRTY (blocking further changes), + * and the returned element pointer is removed from the lru list and + * hash collision chains. The user now should do whatever housekeeping + * is necessary. + * Then he must call lc_changed(lc,element_pointer), to finish + * the change. + * + * NOTE: The user needs to check the lc_number on EACH use, so he recognizes + * any cache set change. + */ +struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr) { struct lc_element *e; @@ -376,12 +356,8 @@ static struct lc_element *__lc_get(struct lru_cache *lc, unsigned int enr, bool RETURN(NULL); } - e = __lc_find(lc, enr, 1); - /* if lc_new_number != lc_number, - * this enr is currently being pulled in already, - * and will be available once the pending transaction - * has been committed. */ - if (e && e->lc_new_number == e->lc_number) { + e = lc_find(lc, enr); + if (e) { ++lc->hits; if (e->refcnt++ == 0) lc->used++; @@ -390,26 +366,6 @@ static struct lc_element *__lc_get(struct lru_cache *lc, unsigned int enr, bool } ++lc->misses; - if (!may_change) - RETURN(NULL); - - /* It has been found above, but on the "to_be_changed" list, not yet - * committed. Don't pull it in twice, wait for the transaction, then - * try again */ - if (e) - RETURN(NULL); - - /* To avoid races with lc_try_lock(), first, mark us dirty - * (using test_and_set_bit, as it implies memory barriers), ... */ - test_and_set_bit(__LC_DIRTY, &lc->flags); - - /* ... only then check if it is locked anyways. If lc_unlock clears - * the dirty bit again, that's not a problem, we will come here again. - */ - if (test_bit(__LC_LOCKED, &lc->flags)) { - ++lc->locked; - RETURN(NULL); - } /* In case there is nothing available and we can not kick out * the LRU element, we have to wait ... @@ -419,109 +375,71 @@ static struct lc_element *__lc_get(struct lru_cache *lc, unsigned int enr, bool RETURN(NULL); } - /* It was not present in the active set. We are going to recycle an - * unused (or even "free") element, but we won't accumulate more than - * max_pending_changes changes. */ - if (lc->pending_changes >= lc->max_pending_changes) + /* it was not present in the active set. + * we are going to recycle an unused (or even "free") element. + * user may need to commit a transaction to record that change. + * we serialize on flags & TF_DIRTY */ + if (test_and_set_bit(__LC_DIRTY, &lc->flags)) { + ++lc->dirty; RETURN(NULL); + } - e = lc_prepare_for_change(lc, enr); + e = lc_get_unused_element(lc); BUG_ON(!e); clear_bit(__LC_STARVING, &lc->flags); BUG_ON(++e->refcnt != 1); lc->used++; - lc->pending_changes++; - RETURN(e); -} + lc->changing_element = e; + lc->new_number = enr; -/** - * lc_get - get element by label, maybe change the active set - * @lc: the lru cache to operate on - * @enr: the label to look up - * - * Finds an element in the cache, increases its usage count, - * "touches" and returns it. - * - * In case the requested number is not present, it needs to be added to the - * cache. Therefore it is possible that an other element becomes evicted from - * the cache. In either case, the user is notified so he is able to e.g. keep - * a persistent log of the cache changes, and therefore the objects in use. 
- * - * Return values: - * NULL - * The cache was marked %LC_STARVING, - * or the requested label was not in the active set - * and a changing transaction is still pending (@lc was marked %LC_DIRTY). - * Or no unused or free element could be recycled (@lc will be marked as - * %LC_STARVING, blocking further lc_get() operations). - * - * pointer to the element with the REQUESTED element number. - * In this case, it can be used right away - * - * pointer to an UNUSED element with some different element number, - * where that different number may also be %LC_FREE. - * - * In this case, the cache is marked %LC_DIRTY, - * so lc_try_lock() will no longer succeed. - * The returned element pointer is moved to the "to_be_changed" list, - * and registered with the new element number on the hash collision chains, - * so it is possible to pick it up from lc_is_used(). - * Up to "max_pending_changes" (see lc_create()) can be accumulated. - * The user now should do whatever housekeeping is necessary, - * typically serialize on lc_try_lock_for_transaction(), then call - * lc_committed(lc) and lc_unlock(), to finish the change. - * - * NOTE: The user needs to check the lc_number on EACH use, so he recognizes - * any cache set change. - */ -struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr) -{ - return __lc_get(lc, enr, 1); + RETURN(e); } -/** - * lc_try_get - get element by label, if present; do not change the active set - * @lc: the lru cache to operate on - * @enr: the label to look up - * - * Finds an element in the cache, increases its usage count, - * "touches" and returns it. - * - * Return values: - * NULL - * The cache was marked %LC_STARVING, - * or the requested label was not in the active set - * - * pointer to the element with the REQUESTED element number. - * In this case, it can be used right away +/* similar to lc_get, + * but only gets a new reference on an existing element. + * you either get the requested element, or NULL. + * will be consolidated into one function. */ struct lc_element *lc_try_get(struct lru_cache *lc, unsigned int enr) { - return __lc_get(lc, enr, 0); + struct lc_element *e; + + PARANOIA_ENTRY(); + if (lc->flags & LC_STARVING) { + ++lc->starving; + RETURN(NULL); + } + + e = lc_find(lc, enr); + if (e) { + ++lc->hits; + if (e->refcnt++ == 0) + lc->used++; + list_move(&e->list, &lc->in_use); /* Not evictable... */ + } + RETURN(e); } /** - * lc_committed - tell @lc that pending changes have been recorded + * lc_changed - tell @lc that the change has been recorded * @lc: the lru cache to operate on - * - * User is expected to serialize on explicit lc_try_lock_for_transaction() - * before the transaction is started, and later needs to lc_unlock() explicitly - * as well. 
+ * @e: the element pending label change */ -void lc_committed(struct lru_cache *lc) +void lc_changed(struct lru_cache *lc, struct lc_element *e) { - struct lc_element *e, *tmp; - PARANOIA_ENTRY(); - list_for_each_entry_safe(e, tmp, &lc->to_be_changed, list) { - /* count number of changes, not number of transactions */ - ++lc->changed; - e->lc_number = e->lc_new_number; - list_move(&e->list, &lc->in_use); - } - lc->pending_changes = 0; + BUG_ON(e != lc->changing_element); + PARANOIA_LC_ELEMENT(lc, e); + ++lc->changed; + e->lc_number = lc->new_number; + list_add(&e->list, &lc->in_use); + hlist_add_head(&e->colision, lc_hash_slot(lc, lc->new_number)); + lc->changing_element = NULL; + lc->new_number = LC_FREE; + clear_bit(__LC_DIRTY, &lc->flags); + smp_mb__after_clear_bit(); RETURN(); } @@ -540,12 +458,13 @@ unsigned int lc_put(struct lru_cache *lc, struct lc_element *e) PARANOIA_ENTRY(); PARANOIA_LC_ELEMENT(lc, e); BUG_ON(e->refcnt == 0); - BUG_ON(e->lc_number != e->lc_new_number); + BUG_ON(e == lc->changing_element); if (--e->refcnt == 0) { /* move it to the front of LRU. */ list_move(&e->list, &lc->lru); lc->used--; - clear_bit_unlock(__LC_STARVING, &lc->flags); + clear_bit(__LC_STARVING, &lc->flags); + smp_mb__after_clear_bit(); } RETURN(e->refcnt); } @@ -585,24 +504,16 @@ unsigned int lc_index_of(struct lru_cache *lc, struct lc_element *e) void lc_set(struct lru_cache *lc, unsigned int enr, int index) { struct lc_element *e; - struct list_head *lh; if (index < 0 || index >= lc->nr_elements) return; e = lc_element_by_index(lc, index); - BUG_ON(e->lc_number != e->lc_new_number); - BUG_ON(e->refcnt != 0); + e->lc_number = enr; - e->lc_number = e->lc_new_number = enr; hlist_del_init(&e->colision); - if (enr == LC_FREE) - lh = &lc->free; - else { - hlist_add_head(&e->colision, lc_hash_slot(lc, enr)); - lh = &lc->lru; - } - list_move(&e->list, lh); + hlist_add_head(&e->colision, lc_hash_slot(lc, enr)); + list_move(&e->list, e->refcnt ? 
&lc->in_use : &lc->lru); } /** @@ -642,10 +553,8 @@ EXPORT_SYMBOL(lc_try_get); EXPORT_SYMBOL(lc_find); EXPORT_SYMBOL(lc_get); EXPORT_SYMBOL(lc_put); -EXPORT_SYMBOL(lc_committed); +EXPORT_SYMBOL(lc_changed); EXPORT_SYMBOL(lc_element_by_index); EXPORT_SYMBOL(lc_index_of); EXPORT_SYMBOL(lc_seq_printf_stats); EXPORT_SYMBOL(lc_seq_dump_details); -EXPORT_SYMBOL(lc_try_lock); -EXPORT_SYMBOL(lc_is_used); diff --git a/trunk/mm/migrate.c b/trunk/mm/migrate.c index 3b676b0c5c3e..32efd8028bc9 100644 --- a/trunk/mm/migrate.c +++ b/trunk/mm/migrate.c @@ -1734,7 +1734,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, page_add_new_anon_rmap(new_page, vma, haddr); set_pmd_at(mm, haddr, pmd, entry); - update_mmu_cache_pmd(vma, address, &entry); + update_mmu_cache_pmd(vma, address, entry); page_remove_rmap(page); /* * Finish the charge transaction under the page table lock to diff --git a/trunk/net/core/net_namespace.c b/trunk/net/core/net_namespace.c index 2e9a3132b8dd..6456439cbbd9 100644 --- a/trunk/net/core/net_namespace.c +++ b/trunk/net/core/net_namespace.c @@ -381,21 +381,6 @@ struct net *get_net_ns_by_pid(pid_t pid) } EXPORT_SYMBOL_GPL(get_net_ns_by_pid); -static __net_init int net_ns_net_init(struct net *net) -{ - return proc_alloc_inum(&net->proc_inum); -} - -static __net_exit void net_ns_net_exit(struct net *net) -{ - proc_free_inum(net->proc_inum); -} - -static struct pernet_operations __net_initdata net_ns_ops = { - .init = net_ns_net_init, - .exit = net_ns_net_exit, -}; - static int __init net_ns_init(void) { struct net_generic *ng; @@ -427,8 +412,6 @@ static int __init net_ns_init(void) mutex_unlock(&net_mutex); - register_pernet_subsys(&net_ns_ops); - return 0; } @@ -647,28 +630,16 @@ static void netns_put(void *ns) static int netns_install(struct nsproxy *nsproxy, void *ns) { - struct net *net = ns; - - if (!ns_capable(net->user_ns, CAP_SYS_ADMIN)) - return -EPERM; - put_net(nsproxy->net_ns); - nsproxy->net_ns = get_net(net); + nsproxy->net_ns = get_net(ns); return 0; } -static unsigned int netns_inum(void *ns) -{ - struct net *net = ns; - return net->proc_inum; -} - const struct proc_ns_operations netns_operations = { .name = "net", .type = CLONE_NEWNET, .get = netns_get, .put = netns_put, .install = netns_install, - .inum = netns_inum, }; #endif diff --git a/trunk/security/yama/yama_lsm.c b/trunk/security/yama/yama_lsm.c index 23414b93771f..2663145d1197 100644 --- a/trunk/security/yama/yama_lsm.c +++ b/trunk/security/yama/yama_lsm.c @@ -298,18 +298,14 @@ int yama_ptrace_access_check(struct task_struct *child, /* No additional restrictions. */ break; case YAMA_SCOPE_RELATIONAL: - rcu_read_lock(); if (!task_is_descendant(current, child) && !ptracer_exception_found(current, child) && - !ns_capable(__task_cred(child)->user_ns, CAP_SYS_PTRACE)) + !ns_capable(task_user_ns(child), CAP_SYS_PTRACE)) rc = -EPERM; - rcu_read_unlock(); break; case YAMA_SCOPE_CAPABILITY: - rcu_read_lock(); - if (!ns_capable(__task_cred(child)->user_ns, CAP_SYS_PTRACE)) + if (!ns_capable(task_user_ns(child), CAP_SYS_PTRACE)) rc = -EPERM; - rcu_read_unlock(); break; case YAMA_SCOPE_NO_ATTACH: default: @@ -347,10 +343,8 @@ int yama_ptrace_traceme(struct task_struct *parent) /* Only disallow PTRACE_TRACEME on more aggressive settings. */ switch (ptrace_scope) { case YAMA_SCOPE_CAPABILITY: - rcu_read_lock(); - if (!ns_capable(__task_cred(parent)->user_ns, CAP_SYS_PTRACE)) + if (!ns_capable(task_user_ns(parent), CAP_SYS_PTRACE)) rc = -EPERM; - rcu_read_unlock(); break; case YAMA_SCOPE_NO_ATTACH: rc = -EPERM;
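
The pid namespace hunks above (zap_pid_ns_processes(), copy_pid_ns(), and the ns_last_pid handler in pid_ns_ctl_handler()) are easiest to reason about from userspace. Below is an illustrative C sketch, not part of this patch: it creates a fresh pid namespace with clone(CLONE_NEWPID), so the child becomes that namespace's PID 1, which is exactly the task zap_pid_ns_processes() later has to wind down. The stack size and the printed messages are arbitrary choices of the example; CAP_SYS_ADMIN is needed.

        /* Illustrative only, not part of this patch. */
        #define _GNU_SOURCE
        #include <sched.h>
        #include <signal.h>
        #include <stdio.h>
        #include <stdlib.h>
        #include <sys/wait.h>
        #include <unistd.h>

        static char child_stack[1024 * 1024];

        static int child(void *arg)
        {
                /* Inside the new pid namespace this prints 1. */
                printf("child sees itself as pid %ld\n", (long)getpid());
                return 0;
        }

        int main(void)
        {
                pid_t pid = clone(child, child_stack + sizeof(child_stack),
                                  CLONE_NEWPID | SIGCHLD, NULL);
                if (pid == -1) {
                        perror("clone");
                        return EXIT_FAILURE;
                }
                /* The parent still sees the child's pid in the outer namespace. */
                printf("parent sees child as pid %ld\n", (long)pid);
                waitpid(pid, NULL, 0);
                return 0;
        }

The last_pid value that pid_ns_ctl_handler() exposes is the per-namespace /proc/sys/kernel/ns_last_pid file, so the allocation behaviour of the new namespace can be observed from the shell as well.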
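
The *_install() callbacks trimmed above (pidns_install(), utsns_install(), netns_install()) are what the kernel runs when userspace calls setns(2) on one of the /proc/<pid>/ns/* files. A hedged sketch follows; it assumes a target pid on the command line, joins that task's UTS namespace, and execs hostname(1) to show where it landed. CAP_SYS_ADMIN over the target namespace is required.

        /* Illustrative only, not part of this patch. */
        #define _GNU_SOURCE
        #include <fcntl.h>
        #include <sched.h>
        #include <stdio.h>
        #include <stdlib.h>
        #include <unistd.h>

        int main(int argc, char *argv[])
        {
                char path[64];

                if (argc != 2) {
                        fprintf(stderr, "usage: %s <pid>\n", argv[0]);
                        return EXIT_FAILURE;
                }
                snprintf(path, sizeof(path), "/proc/%s/ns/uts", argv[1]);

                int fd = open(path, O_RDONLY);
                if (fd == -1) {
                        perror("open");
                        return EXIT_FAILURE;
                }
                /* nstype 0 lets the kernel accept any namespace type behind fd. */
                if (setns(fd, 0) == -1) {
                        perror("setns");
                        return EXIT_FAILURE;
                }
                close(fd);
                execlp("hostname", "hostname", (char *)NULL);
                perror("execlp");
                return EXIT_FAILURE;
        }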
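
proc_uid_map_write() and new_idmap_permitted() above arbitrate writes to /proc/<pid>/uid_map. The following is a sketch of the userspace side, under the assumption that the running kernel permits it: unshare a user namespace and try to map the caller's uid to 0 inside it. With the "map your own filesystem ids" branch removed by this hunk, such a write needs CAP_SETUID in the parent namespace instead, so expect EPERM on a kernel carrying this revert.

        /* Illustrative only, not part of this patch. */
        #define _GNU_SOURCE
        #include <fcntl.h>
        #include <sched.h>
        #include <stdio.h>
        #include <stdlib.h>
        #include <unistd.h>

        int main(void)
        {
                uid_t outer_uid = getuid();
                char map[64];

                if (unshare(CLONE_NEWUSER) == -1) {
                        perror("unshare(CLONE_NEWUSER)");
                        return EXIT_FAILURE;
                }

                int fd = open("/proc/self/uid_map", O_WRONLY);
                if (fd == -1) {
                        perror("open uid_map");
                        return EXIT_FAILURE;
                }
                /* format: "<id inside ns> <id outside ns> <count>" */
                int len = snprintf(map, sizeof(map), "0 %u 1\n", (unsigned)outer_uid);
                if (write(fd, map, (size_t)len) != len) {
                        perror("write uid_map");
                        return EXIT_FAILURE;
                }
                close(fd);

                printf("uid inside the new user namespace: %u\n", (unsigned)getuid());
                return 0;
        }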
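
For the copy_utsname()/clone_uts_ns() path the observable effect is simply a private hostname. A minimal demo, assuming CAP_SYS_ADMIN: unshare the UTS namespace and set a hostname the parent namespace never sees.

        /* Illustrative only, not part of this patch. */
        #define _GNU_SOURCE
        #include <sched.h>
        #include <stdio.h>
        #include <stdlib.h>
        #include <string.h>
        #include <unistd.h>

        int main(void)
        {
                char name[256];

                if (unshare(CLONE_NEWUTS) == -1) {
                        perror("unshare(CLONE_NEWUTS)");
                        return EXIT_FAILURE;
                }
                /* Only visible inside the new namespace (cf. clone_uts_ns()). */
                if (sethostname("uts-demo", strlen("uts-demo")) == -1) {
                        perror("sethostname");
                        return EXIT_FAILURE;
                }
                if (gethostname(name, sizeof(name)) == -1) {
                        perror("gethostname");
                        return EXIT_FAILURE;
                }
                printf("hostname in the new UTS namespace: %s\n", name);
                return 0;
        }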
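
The lib/lru_cache.c rework above is hard to follow without the surrounding DRBD code, so here is a deliberately simplified userspace sketch of the underlying idea rather than the kernel API: a fixed-size active set looked up by element number, with LRU recycling on a miss and a flag telling the caller that the active set changed (the role lc_changed() plays above). Slot count, the timestamp-based LRU, and all names are inventions of the example; the real code uses list_heads, a hash table, and reference counts.

        /* Illustrative only, not the kernel lru_cache API. */
        #include <stdio.h>

        #define NR_ELEMENTS 4
        #define LC_FREE     (~0u)

        struct element {
                unsigned int number;     /* label this slot currently tracks */
                unsigned long last_use;  /* cheap LRU stamp instead of a list */
        };

        static struct element slots[NR_ELEMENTS];
        static unsigned long clock_tick;

        /* Return the slot tracking @nr, recycling the least recently used slot
         * on a miss.  *changed tells the caller the active set changed, i.e.
         * that the transition must be recorded before the slot is used.
         */
        static struct element *cache_get(unsigned int nr, int *changed)
        {
                struct element *victim = &slots[0];

                *changed = 0;
                for (int i = 0; i < NR_ELEMENTS; i++) {
                        if (slots[i].number == nr) {
                                slots[i].last_use = ++clock_tick;
                                return &slots[i];          /* hit */
                        }
                        if (slots[i].last_use < victim->last_use)
                                victim = &slots[i];
                }
                /* miss: recycle the LRU slot for the new label */
                victim->number = nr;
                victim->last_use = ++clock_tick;
                *changed = 1;
                return victim;
        }

        int main(void)
        {
                unsigned int workload[] = { 7, 3, 7, 9, 12, 3, 5 };

                for (int i = 0; i < NR_ELEMENTS; i++)
                        slots[i].number = LC_FREE;

                for (unsigned i = 0; i < sizeof(workload) / sizeof(workload[0]); i++) {
                        int changed;
                        struct element *e = cache_get(workload[i], &changed);
                        printf("label %2u -> slot %ld%s\n", workload[i],
                               (long)(e - slots),
                               changed ? " (active set changed)" : "");
                }
                return 0;
        }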