From a108861652fd28fba729903950a3c9d6260d5d48 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 27 Jul 2011 20:25:57 -0700 Subject: [PATCH] --- yaml --- r: 261079 b: refs/heads/master c: facfddef2c76110b8e321921f7e54518c3dd1579 h: refs/heads/master i: 261077: fca8212c9720e91000fa13506e825e34e8ebfc2d 261075: bcd5be58f3558d0d41621e16b4057e3e2f46ea98 261071: 65b296036afa9898fc71f476e9f3ce02be2eb912 v: v3 --- [refs] | 2 +- trunk/Documentation/md.txt | 29 +- trunk/arch/sparc/kernel/cpu.c | 1 + trunk/drivers/isdn/i4l/isdn_net.c | 3 - trunk/drivers/md/bitmap.c | 137 +- trunk/drivers/md/bitmap.h | 5 + trunk/drivers/md/md.c | 871 ++---------- trunk/drivers/md/md.h | 110 +- trunk/drivers/md/raid1.c | 962 ++++---------- trunk/drivers/md/raid1.h | 26 +- trunk/drivers/md/raid10.c | 1183 ++++------------- trunk/drivers/md/raid10.h | 21 - trunk/drivers/md/raid5.c | 1015 ++++++++------ trunk/drivers/md/raid5.h | 99 +- trunk/drivers/net/Makefile | 1 - trunk/drivers/net/acenic.c | 45 +- trunk/drivers/net/acenic.h | 6 +- trunk/drivers/net/bonding/bond_main.c | 8 +- trunk/drivers/net/bonding/bond_sysfs.c | 133 +- trunk/drivers/net/forcedeth.c | 16 +- trunk/drivers/net/gianfar.c | 6 +- trunk/drivers/net/ifb.c | 2 +- trunk/drivers/net/macvlan.c | 2 +- trunk/drivers/net/tg3.c | 287 ++-- trunk/drivers/net/tg3.h | 9 +- trunk/drivers/net/tun.c | 1 - trunk/drivers/net/usb/asix.c | 7 +- trunk/drivers/net/veth.c | 2 - trunk/drivers/net/wan/hdlc_fr.c | 5 +- trunk/drivers/net/wireless/airo.c | 1 - trunk/drivers/net/wireless/b43/Kconfig | 2 +- trunk/drivers/net/wireless/b43/bus.c | 2 - trunk/drivers/net/wireless/b43/main.c | 5 +- .../drivers/net/wireless/hostap/hostap_main.c | 1 - trunk/drivers/nfc/pn533.c | 2 +- .../staging/ath6kl/os/linux/ar6000_drv.c | 1 - trunk/include/linux/if.h | 2 - trunk/include/linux/netdevice.h | 7 +- trunk/include/linux/raid/md_p.h | 14 +- trunk/net/8021q/vlan_dev.c | 2 +- trunk/net/bluetooth/bnep/netdev.c | 1 - trunk/net/core/dev.c | 4 +- trunk/net/core/pktgen.c | 8 +- trunk/net/ethernet/eth.c | 2 - trunk/net/ipv4/devinet.c | 16 +- trunk/net/ipv6/addrconf.c | 2 - trunk/net/l2tp/l2tp_eth.c | 2 +- trunk/net/mac80211/iface.c | 1 - trunk/net/socket.c | 2 +- trunk/net/wireless/reg.c | 7 +- trunk/sound/oss/ad1848.c | 6 +- trunk/sound/oss/sb_mixer.c | 6 +- trunk/sound/pci/asihpi/hpioctl.c | 13 +- trunk/sound/pci/hda/patch_realtek.c | 29 +- trunk/sound/pci/hda/patch_sigmatel.c | 199 +-- 55 files changed, 1696 insertions(+), 3635 deletions(-) diff --git a/[refs] b/[refs] index 597cb1b06301..5349758feef7 100644 --- a/[refs] +++ b/[refs] @@ -1,2 +1,2 @@ --- -refs/heads/master: d5eab9152a3b4ce962c02ad0a0e4d0ec94aadd92 +refs/heads/master: facfddef2c76110b8e321921f7e54518c3dd1579 diff --git a/trunk/Documentation/md.txt b/trunk/Documentation/md.txt index fc94770f44ab..f0eee83ff78a 100644 --- a/trunk/Documentation/md.txt +++ b/trunk/Documentation/md.txt @@ -360,20 +360,18 @@ Each directory contains: A file recording the current state of the device in the array which can be a comma separated list of faulty - device has been kicked from active use due to - a detected fault or it has unacknowledged bad - blocks + a detected fault in_sync - device is a fully in-sync member of the array writemostly - device will only be subject to read requests if there are no other options. This applies only to raid1 arrays. - blocked - device has failed, and the failure hasn't been - acknowledged yet by the metadata handler. 
+ blocked - device has failed, metadata is "external", + and the failure hasn't been acknowledged yet. Writes that would write to this device if it were not faulty are blocked. spare - device is working, but not a full member. This includes spares that are in the process of being recovered to - write_error - device has ever seen a write error. This list may grow in future. This can be written to. Writing "faulty" simulates a failure on the device. @@ -381,11 +379,9 @@ Each directory contains: Writing "writemostly" sets the writemostly flag. Writing "-writemostly" clears the writemostly flag. Writing "blocked" sets the "blocked" flag. - Writing "-blocked" clears the "blocked" flags and allows writes - to complete and possibly simulates an error. + Writing "-blocked" clears the "blocked" flag and allows writes + to complete. Writing "in_sync" sets the in_sync flag. - Writing "write_error" sets writeerrorseen flag. - Writing "-write_error" clears writeerrorseen flag. This file responds to select/poll. Any change to 'faulty' or 'blocked' causes an event. @@ -423,6 +419,7 @@ Each directory contains: written, it will be rejected. recovery_start + When the device is not 'in_sync', this records the number of sectors from the start of the device which are known to be correct. This is normally zero, but during a recovery @@ -438,20 +435,6 @@ Each directory contains: Setting this to 'none' is equivalent to setting 'in_sync'. Setting to any other value also clears the 'in_sync' flag. - bad_blocks - This gives the list of all known bad blocks in the form of - start address and length (in sectors respectively). If output - is too big to fit in a page, it will be truncated. Writing - "sector length" to this file adds new acknowledged (i.e. - recorded to disk safely) bad blocks. - - unacknowledged_bad_blocks - This gives the list of known-but-not-yet-saved-to-disk bad - blocks in the same form of 'bad_blocks'. If output is too big - to fit in a page, it will be truncated. Writing to this file - adds bad blocks without acknowledging them. This is largely - for testing. 
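/*
 * [Editor's note] A minimal userspace sketch of the sysfs interface the
 * removed documentation above describes: writing "sector length" to the
 * per-device bad_blocks file records an acknowledged bad range, and
 * reading it back lists "start length" pairs (unacknowledged_bad_blocks
 * works the same way for unacked entries).  The device path is a
 * hypothetical example; this interface is what the patch removes.
 */
#include <stdio.h>

int main(void)
{
        const char *bb = "/sys/block/md0/md/dev-sda1/bad_blocks"; /* example path */
        char line[64];
        FILE *f;

        /* add an acknowledged bad range: 8 sectors starting at sector 4096 */
        f = fopen(bb, "w");
        if (!f)
                return 1;
        fprintf(f, "%llu %d\n", 4096ULL, 8);
        fclose(f);

        /* read back the recorded ranges, one "start length" pair per line */
        f = fopen(bb, "r");
        if (!f)
                return 1;
        while (fgets(line, sizeof(line), f))
                fputs(line, stdout);
        fclose(f);
        return 0;
}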
- An active md device will also contain an entry for each active device diff --git a/trunk/arch/sparc/kernel/cpu.c b/trunk/arch/sparc/kernel/cpu.c index 138dbbc8dc84..70f035c86c03 100644 --- a/trunk/arch/sparc/kernel/cpu.c +++ b/trunk/arch/sparc/kernel/cpu.c @@ -479,6 +479,7 @@ static void __init sun4v_cpu_probe(void) prom_cpu_compatible); sparc_cpu_type = "Unknown SUN4V CPU"; sparc_fpu_type = "Unknown SUN4V FPU"; + sparc_pmu_type = "Unknown SUN4V PMU"; break; } } diff --git a/trunk/drivers/isdn/i4l/isdn_net.c b/trunk/drivers/isdn/i4l/isdn_net.c index 1f73d7f7e024..48e9cc0369b1 100644 --- a/trunk/drivers/isdn/i4l/isdn_net.c +++ b/trunk/drivers/isdn/i4l/isdn_net.c @@ -2532,9 +2532,6 @@ static void _isdn_setup(struct net_device *dev) /* Setup the generic properties */ dev->flags = IFF_NOARP|IFF_POINTOPOINT; - - /* isdn prepends a header in the tx path, can't share skbs */ - dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->header_ops = NULL; dev->netdev_ops = &isdn_netdev_ops; diff --git a/trunk/drivers/md/bitmap.c b/trunk/drivers/md/bitmap.c index 0dc6546b77a8..574b09afedd3 100644 --- a/trunk/drivers/md/bitmap.c +++ b/trunk/drivers/md/bitmap.c @@ -29,6 +29,7 @@ #include "md.h" #include "bitmap.h" +#include <linux/dm-dirty-log.h>
page->index); + /* set the bit */ + kaddr = kmap_atomic(page, KM_USER0); + if (bitmap->flags & BITMAP_HOSTENDIAN) + set_bit(bit, kaddr); + else + __test_and_set_bit_le(bit, kaddr); + kunmap_atomic(kaddr, KM_USER0); + PRINTK("set file bit %lu page %lu\n", bit, page->index); + } /* record page number so it gets flushed to disk when unplug occurs */ set_page_attr(bitmap, page, BITMAP_PAGE_DIRTY); } @@ -936,6 +952,16 @@ void bitmap_unplug(struct bitmap *bitmap) if (!bitmap) return; + if (!bitmap->filemap) { + /* Must be using a dirty_log */ + struct dm_dirty_log *log = bitmap->mddev->bitmap_info.log; + dirty = test_and_clear_bit(BITMAP_PAGE_DIRTY, &bitmap->logattrs); + need_write = test_and_clear_bit(BITMAP_PAGE_NEEDWRITE, &bitmap->logattrs); + if (dirty || need_write) + if (log->type->flush(log)) + bitmap->flags |= BITMAP_WRITE_ERROR; + goto out; + } /* look at each page to see if there are any set bits that need to be * flushed out to disk */ @@ -964,6 +990,7 @@ void bitmap_unplug(struct bitmap *bitmap) else md_super_wait(bitmap->mddev); } +out: if (bitmap->flags & BITMAP_WRITE_ERROR) bitmap_file_kick(bitmap); } @@ -1172,6 +1199,7 @@ void bitmap_daemon_work(mddev_t *mddev) struct page *page = NULL, *lastpage = NULL; sector_t blocks; void *paddr; + struct dm_dirty_log *log = mddev->bitmap_info.log; /* Use a mutex to guard daemon_work against * bitmap_destroy. @@ -1196,11 +1224,12 @@ void bitmap_daemon_work(mddev_t *mddev) spin_lock_irqsave(&bitmap->lock, flags); for (j = 0; j < bitmap->chunks; j++) { bitmap_counter_t *bmc; - if (!bitmap->filemap) - /* error or shutdown */ - break; - - page = filemap_get_page(bitmap, j); + if (!bitmap->filemap) { + if (!log) + /* error or shutdown */ + break; + } else + page = filemap_get_page(bitmap, j); if (page != lastpage) { /* skip this page unless it's marked as needing cleaning */ @@ -1269,16 +1298,17 @@ void bitmap_daemon_work(mddev_t *mddev) -1); /* clear the bit */ - paddr = kmap_atomic(page, KM_USER0); - if (bitmap->flags & BITMAP_HOSTENDIAN) - clear_bit(file_page_offset(bitmap, j), - paddr); - else - __clear_bit_le( - file_page_offset(bitmap, - j), - paddr); - kunmap_atomic(paddr, KM_USER0); + if (page) { + paddr = kmap_atomic(page, KM_USER0); + if (bitmap->flags & BITMAP_HOSTENDIAN) + clear_bit(file_page_offset(bitmap, j), + paddr); + else + __test_and_clear_bit_le(file_page_offset(bitmap, j), + paddr); + kunmap_atomic(paddr, KM_USER0); + } else + log->type->clear_region(log, j); } } else j |= PAGE_COUNTER_MASK; @@ -1286,12 +1316,16 @@ void bitmap_daemon_work(mddev_t *mddev) spin_unlock_irqrestore(&bitmap->lock, flags); /* now sync the final page */ - if (lastpage != NULL) { + if (lastpage != NULL || log != NULL) { spin_lock_irqsave(&bitmap->lock, flags); if (test_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE)) { clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); spin_unlock_irqrestore(&bitmap->lock, flags); - write_page(bitmap, lastpage, 0); + if (lastpage) + write_page(bitmap, lastpage, 0); + else + if (log->type->flush(log)) + bitmap->flags |= BITMAP_WRITE_ERROR; } else { set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); spin_unlock_irqrestore(&bitmap->lock, flags); @@ -1733,10 +1767,12 @@ int bitmap_create(mddev_t *mddev) BUILD_BUG_ON(sizeof(bitmap_super_t) != 256); if (!file - && !mddev->bitmap_info.offset) /* bitmap disabled, nothing to do */ + && !mddev->bitmap_info.offset + && !mddev->bitmap_info.log) /* bitmap disabled, nothing to do */ return 0; BUG_ON(file && mddev->bitmap_info.offset); + 
BUG_ON(mddev->bitmap_info.offset && mddev->bitmap_info.log); bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL); if (!bitmap) @@ -1827,7 +1863,6 @@ int bitmap_create(mddev_t *mddev) int bitmap_load(mddev_t *mddev) { int err = 0; - sector_t start = 0; sector_t sector = 0; struct bitmap *bitmap = mddev->bitmap; @@ -1846,14 +1881,24 @@ int bitmap_load(mddev_t *mddev) } bitmap_close_sync(bitmap); - if (mddev->degraded == 0 - || bitmap->events_cleared == mddev->events) - /* no need to keep dirty bits to optimise a - * re-add of a missing device */ - start = mddev->recovery_cp; - - err = bitmap_init_from_disk(bitmap, start); - + if (mddev->bitmap_info.log) { + unsigned long i; + struct dm_dirty_log *log = mddev->bitmap_info.log; + for (i = 0; i < bitmap->chunks; i++) + if (!log->type->in_sync(log, i, 1)) + bitmap_set_memory_bits(bitmap, + (sector_t)i << CHUNK_BLOCK_SHIFT(bitmap), + 1); + } else { + sector_t start = 0; + if (mddev->degraded == 0 + || bitmap->events_cleared == mddev->events) + /* no need to keep dirty bits to optimise a + * re-add of a missing device */ + start = mddev->recovery_cp; + + err = bitmap_init_from_disk(bitmap, start); + } if (err) goto out; diff --git a/trunk/drivers/md/bitmap.h b/trunk/drivers/md/bitmap.h index a28f2e5588c6..b2a127e891ac 100644 --- a/trunk/drivers/md/bitmap.h +++ b/trunk/drivers/md/bitmap.h @@ -212,6 +212,10 @@ struct bitmap { unsigned long file_pages; /* number of pages in the file */ int last_page_size; /* bytes in the last page */ + unsigned long logattrs; /* used when filemap_attr doesn't exist + * because we are working with a dirty_log + */ + unsigned long flags; int allclean; @@ -233,6 +237,7 @@ struct bitmap { wait_queue_head_t behind_wait; struct sysfs_dirent *sysfs_can_clear; + }; /* the bitmap API */ diff --git a/trunk/drivers/md/md.c b/trunk/drivers/md/md.c index 8e221a20f5d9..dfc9425db70b 100644 --- a/trunk/drivers/md/md.c +++ b/trunk/drivers/md/md.c @@ -215,55 +215,6 @@ struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, } EXPORT_SYMBOL_GPL(bio_clone_mddev); -void md_trim_bio(struct bio *bio, int offset, int size) -{ - /* 'bio' is a cloned bio which we need to trim to match - * the given offset and size. 
- * This requires adjusting bi_sector, bi_size, and bi_io_vec - */ - int i; - struct bio_vec *bvec; - int sofar = 0; - - size <<= 9; - if (offset == 0 && size == bio->bi_size) - return; - - bio->bi_sector += offset; - bio->bi_size = size; - offset <<= 9; - clear_bit(BIO_SEG_VALID, &bio->bi_flags); - - while (bio->bi_idx < bio->bi_vcnt && - bio->bi_io_vec[bio->bi_idx].bv_len <= offset) { - /* remove this whole bio_vec */ - offset -= bio->bi_io_vec[bio->bi_idx].bv_len; - bio->bi_idx++; - } - if (bio->bi_idx < bio->bi_vcnt) { - bio->bi_io_vec[bio->bi_idx].bv_offset += offset; - bio->bi_io_vec[bio->bi_idx].bv_len -= offset; - } - /* avoid any complications with bi_idx being non-zero*/ - if (bio->bi_idx) { - memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx, - (bio->bi_vcnt - bio->bi_idx) * sizeof(struct bio_vec)); - bio->bi_vcnt -= bio->bi_idx; - bio->bi_idx = 0; - } - /* Make sure vcnt and last bv are not too big */ - bio_for_each_segment(bvec, bio, i) { - if (sofar + bvec->bv_len > size) - bvec->bv_len = size - sofar; - if (bvec->bv_len == 0) { - bio->bi_vcnt = i; - break; - } - sofar += bvec->bv_len; - } -} -EXPORT_SYMBOL_GPL(md_trim_bio); - /* * We have a system wide 'event count' that is incremented * on any 'interesting' event, and readers of /proc/mdstat @@ -806,10 +757,6 @@ static void free_disk_sb(mdk_rdev_t * rdev) rdev->sb_start = 0; rdev->sectors = 0; } - if (rdev->bb_page) { - put_page(rdev->bb_page); - rdev->bb_page = NULL; - } } @@ -1078,7 +1025,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version ret = -EINVAL; bdevname(rdev->bdev, b); - sb = page_address(rdev->sb_page); + sb = (mdp_super_t*)page_address(rdev->sb_page); if (sb->md_magic != MD_SB_MAGIC) { printk(KERN_ERR "md: invalid raid superblock magic on %s\n", @@ -1107,7 +1054,6 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version rdev->preferred_minor = sb->md_minor; rdev->data_offset = 0; rdev->sb_size = MD_SB_BYTES; - rdev->badblocks.shift = -1; if (sb->level == LEVEL_MULTIPATH) rdev->desc_nr = -1; @@ -1118,7 +1064,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version ret = 1; } else { __u64 ev1, ev2; - mdp_super_t *refsb = page_address(refdev->sb_page); + mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page); if (!uuid_equal(refsb, sb)) { printk(KERN_WARNING "md: %s has different UUID to %s\n", b, bdevname(refdev->bdev,b2)); @@ -1153,7 +1099,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) { mdp_disk_t *desc; - mdp_super_t *sb = page_address(rdev->sb_page); + mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page); __u64 ev1 = md_event(sb); rdev->raid_disk = -1; @@ -1284,7 +1230,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) rdev->sb_size = MD_SB_BYTES; - sb = page_address(rdev->sb_page); + sb = (mdp_super_t*)page_address(rdev->sb_page); memset(sb, 0, sizeof(*sb)); @@ -1449,8 +1395,6 @@ static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb) return cpu_to_le32(csum); } -static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors, - int acknowledged); static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) { struct mdp_superblock_1 *sb; @@ -1491,7 +1435,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) if (ret) return ret; - sb = page_address(rdev->sb_page); + sb = (struct 
mdp_superblock_1*)page_address(rdev->sb_page); if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || sb->major_version != cpu_to_le32(1) || @@ -1529,52 +1473,12 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) else rdev->desc_nr = le32_to_cpu(sb->dev_number); - if (!rdev->bb_page) { - rdev->bb_page = alloc_page(GFP_KERNEL); - if (!rdev->bb_page) - return -ENOMEM; - } - if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) && - rdev->badblocks.count == 0) { - /* need to load the bad block list. - * Currently we limit it to one page. - */ - s32 offset; - sector_t bb_sector; - u64 *bbp; - int i; - int sectors = le16_to_cpu(sb->bblog_size); - if (sectors > (PAGE_SIZE / 512)) - return -EINVAL; - offset = le32_to_cpu(sb->bblog_offset); - if (offset == 0) - return -EINVAL; - bb_sector = (long long)offset; - if (!sync_page_io(rdev, bb_sector, sectors << 9, - rdev->bb_page, READ, true)) - return -EIO; - bbp = (u64 *)page_address(rdev->bb_page); - rdev->badblocks.shift = sb->bblog_shift; - for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) { - u64 bb = le64_to_cpu(*bbp); - int count = bb & (0x3ff); - u64 sector = bb >> 10; - sector <<= sb->bblog_shift; - count <<= sb->bblog_shift; - if (bb + 1 == 0) - break; - if (md_set_badblocks(&rdev->badblocks, - sector, count, 1) == 0) - return -EINVAL; - } - } else if (sb->bblog_offset == 0) - rdev->badblocks.shift = -1; - if (!refdev) { ret = 1; } else { __u64 ev1, ev2; - struct mdp_superblock_1 *refsb = page_address(refdev->sb_page); + struct mdp_superblock_1 *refsb = + (struct mdp_superblock_1*)page_address(refdev->sb_page); if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || sb->level != refsb->level || @@ -1609,7 +1513,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) { - struct mdp_superblock_1 *sb = page_address(rdev->sb_page); + struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); __u64 ev1 = le64_to_cpu(sb->events); rdev->raid_disk = -1; @@ -1715,12 +1619,13 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) int max_dev, i; /* make rdev->sb match mddev and rdev data. 
*/ - sb = page_address(rdev->sb_page); + sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); sb->feature_map = 0; sb->pad0 = 0; sb->recovery_offset = cpu_to_le64(0); memset(sb->pad1, 0, sizeof(sb->pad1)); + memset(sb->pad2, 0, sizeof(sb->pad2)); memset(sb->pad3, 0, sizeof(sb->pad3)); sb->utime = cpu_to_le64((__u64)mddev->utime); @@ -1760,40 +1665,6 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); } - if (rdev->badblocks.count == 0) - /* Nothing to do for bad blocks*/ ; - else if (sb->bblog_offset == 0) - /* Cannot record bad blocks on this device */ - md_error(mddev, rdev); - else { - struct badblocks *bb = &rdev->badblocks; - u64 *bbp = (u64 *)page_address(rdev->bb_page); - u64 *p = bb->page; - sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS); - if (bb->changed) { - unsigned seq; - -retry: - seq = read_seqbegin(&bb->lock); - - memset(bbp, 0xff, PAGE_SIZE); - - for (i = 0 ; i < bb->count ; i++) { - u64 internal_bb = *p++; - u64 store_bb = ((BB_OFFSET(internal_bb) << 10) - | BB_LEN(internal_bb)); - *bbp++ = cpu_to_le64(store_bb); - } - if (read_seqretry(&bb->lock, seq)) - goto retry; - - bb->sector = (rdev->sb_start + - (int)le32_to_cpu(sb->bblog_offset)); - bb->size = le16_to_cpu(sb->bblog_size); - bb->changed = 0; - } - } - max_dev = 0; list_for_each_entry(rdev2, &mddev->disks, same_set) if (rdev2->desc_nr+1 > max_dev) @@ -1853,7 +1724,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) num_sectors = max_sectors; rdev->sb_start = sb_start; } - sb = page_address(rdev->sb_page); + sb = (struct mdp_superblock_1 *) page_address(rdev->sb_page); sb->data_size = cpu_to_le64(num_sectors); sb->super_offset = rdev->sb_start; sb->sb_csum = calc_sb_1_csum(sb); @@ -2051,7 +1922,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) bd_link_disk_holder(rdev->bdev, mddev->gendisk); /* May as well allow recovery to be retried once */ - mddev->recovery_disabled++; + mddev->recovery_disabled = 0; return 0; @@ -2082,9 +1953,6 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev) sysfs_remove_link(&rdev->kobj, "block"); sysfs_put(rdev->sysfs_state); rdev->sysfs_state = NULL; - kfree(rdev->badblocks.page); - rdev->badblocks.count = 0; - rdev->badblocks.page = NULL; /* We need to delay this, otherwise we can deadlock when * writing to 'remove' to "dev/state". We also need * to delay it due to rcu usage. 
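/*
 * [Editor's note] The on-disk bad-block log deleted from super_1_load()
 * and super_1_sync() above stores each range as a u64: length in the low
 * 10 bits, start sector in the remaining bits, with an all-ones word
 * ("bb + 1 == 0") terminating the list.  The in-memory 'acknowledged'
 * bit is not stored - anything written to disk is acked by definition.
 * A standalone round-trip of that encoding; the little-endian conversion
 * (cpu_to_le64/le64_to_cpu) and bblog_shift scaling of the real code are
 * omitted here for brevity.
 */
#include <stdio.h>
#include <stdint.h>

static uint64_t bblog_pack(uint64_t sector, int count)
{
        return (sector << 10) | (uint64_t)count;        /* as super_1_sync() */
}

int main(void)
{
        uint64_t log[3];
        int i;

        log[0] = bblog_pack(4096, 8);
        log[1] = bblog_pack(123456, 512);       /* 512 = largest length */
        log[2] = ~0ULL;                         /* terminator */

        for (i = 0; ; i++) {
                uint64_t bb = log[i];
                if (bb + 1 == 0)                /* end of list */
                        break;
                /* as super_1_load(): low 10 bits count, rest sector */
                printf("sector %llu count %d\n",
                       (unsigned long long)(bb >> 10), (int)(bb & 0x3ff));
        }
        return 0;
}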
@@ -2259,10 +2127,10 @@ static void print_rdev(mdk_rdev_t *rdev, int major_version) printk(KERN_INFO "md: rdev superblock (MJ:%d):\n", major_version); switch (major_version) { case 0: - print_sb_90(page_address(rdev->sb_page)); + print_sb_90((mdp_super_t*)page_address(rdev->sb_page)); break; case 1: - print_sb_1(page_address(rdev->sb_page)); + print_sb_1((struct mdp_superblock_1 *)page_address(rdev->sb_page)); break; } } else @@ -2326,7 +2194,6 @@ static void md_update_sb(mddev_t * mddev, int force_change) mdk_rdev_t *rdev; int sync_req; int nospares = 0; - int any_badblocks_changed = 0; repeat: /* First make sure individual recovery_offsets are correct */ @@ -2341,18 +2208,8 @@ static void md_update_sb(mddev_t * mddev, int force_change) if (!mddev->persistent) { clear_bit(MD_CHANGE_CLEAN, &mddev->flags); clear_bit(MD_CHANGE_DEVS, &mddev->flags); - if (!mddev->external) { + if (!mddev->external) clear_bit(MD_CHANGE_PENDING, &mddev->flags); - list_for_each_entry(rdev, &mddev->disks, same_set) { - if (rdev->badblocks.changed) { - md_ack_all_badblocks(&rdev->badblocks); - md_error(mddev, rdev); - } - clear_bit(Blocked, &rdev->flags); - clear_bit(BlockedBadBlocks, &rdev->flags); - wake_up(&rdev->blocked_wait); - } - } wake_up(&mddev->sb_wait); return; } @@ -2408,14 +2265,6 @@ static void md_update_sb(mddev_t * mddev, int force_change) MD_BUG(); mddev->events --; } - - list_for_each_entry(rdev, &mddev->disks, same_set) { - if (rdev->badblocks.changed) - any_badblocks_changed++; - if (test_bit(Faulty, &rdev->flags)) - set_bit(FaultRecorded, &rdev->flags); - } - sync_sbs(mddev, nospares); spin_unlock_irq(&mddev->write_lock); @@ -2441,13 +2290,6 @@ static void md_update_sb(mddev_t * mddev, int force_change) bdevname(rdev->bdev,b), (unsigned long long)rdev->sb_start); rdev->sb_events = mddev->events; - if (rdev->badblocks.size) { - md_super_write(mddev, rdev, - rdev->badblocks.sector, - rdev->badblocks.size << 9, - rdev->bb_page); - rdev->badblocks.size = 0; - } } else dprintk(")\n"); @@ -2471,15 +2313,6 @@ static void md_update_sb(mddev_t * mddev, int force_change) if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) sysfs_notify(&mddev->kobj, NULL, "sync_completed"); - list_for_each_entry(rdev, &mddev->disks, same_set) { - if (test_and_clear_bit(FaultRecorded, &rdev->flags)) - clear_bit(Blocked, &rdev->flags); - - if (any_badblocks_changed) - md_ack_all_badblocks(&rdev->badblocks); - clear_bit(BlockedBadBlocks, &rdev->flags); - wake_up(&rdev->blocked_wait); - } } /* words written to sysfs files may, or may not, be \n terminated. 
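/*
 * [Editor's note] The comment above is why the sysfs store handlers that
 * follow (state_store() etc.) compare input with cmd_match() rather than
 * strcmp(): the written word must match exactly, except that one trailing
 * newline is tolerated.  A standalone sketch of that comparator, with the
 * shape md.c's cmd_match() takes:
 */
#include <assert.h>

static int cmd_match(const char *cmd, const char *str)
{
        /* advance over the common prefix */
        while (*cmd && *str && *cmd == *str) {
                cmd++;
                str++;
        }
        /* tolerate a single trailing newline, as left by `echo` */
        if (*cmd == '\n')
                cmd++;
        return !*cmd && !*str;  /* match only if both are exhausted */
}

int main(void)
{
        assert(cmd_match("faulty\n", "faulty"));        /* echo faulty > state */
        assert(cmd_match("faulty", "faulty"));          /* echo -n faulty > state */
        assert(!cmd_match("faulty2", "faulty"));        /* trailing junk rejected */
        assert(!cmd_match("fault", "faulty"));          /* bare prefix rejected */
        return 0;
}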
@@ -2514,8 +2347,7 @@ state_show(mdk_rdev_t *rdev, char *page) char *sep = ""; size_t len = 0; - if (test_bit(Faulty, &rdev->flags) || - rdev->badblocks.unacked_exist) { + if (test_bit(Faulty, &rdev->flags)) { len+= sprintf(page+len, "%sfaulty",sep); sep = ","; } @@ -2527,8 +2359,7 @@ state_show(mdk_rdev_t *rdev, char *page) len += sprintf(page+len, "%swrite_mostly",sep); sep = ","; } - if (test_bit(Blocked, &rdev->flags) || - rdev->badblocks.unacked_exist) { + if (test_bit(Blocked, &rdev->flags)) { len += sprintf(page+len, "%sblocked", sep); sep = ","; } @@ -2537,10 +2368,6 @@ state_show(mdk_rdev_t *rdev, char *page) len += sprintf(page+len, "%sspare", sep); sep = ","; } - if (test_bit(WriteErrorSeen, &rdev->flags)) { - len += sprintf(page+len, "%swrite_error", sep); - sep = ","; - } return len+sprintf(page+len, "\n"); } @@ -2548,15 +2375,13 @@ static ssize_t state_store(mdk_rdev_t *rdev, const char *buf, size_t len) { /* can write - * faulty - simulates an error + * faulty - simulates and error * remove - disconnects the device * writemostly - sets write_mostly * -writemostly - clears write_mostly - * blocked - sets the Blocked flags - * -blocked - clears the Blocked and possibly simulates an error + * blocked - sets the Blocked flag + * -blocked - clears the Blocked flag * insync - sets Insync providing device isn't active - * write_error - sets WriteErrorSeen - * -write_error - clears WriteErrorSeen */ int err = -EINVAL; if (cmd_match(buf, "faulty") && rdev->mddev->pers) { @@ -2583,15 +2408,7 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len) set_bit(Blocked, &rdev->flags); err = 0; } else if (cmd_match(buf, "-blocked")) { - if (!test_bit(Faulty, &rdev->flags) && - test_bit(BlockedBadBlocks, &rdev->flags)) { - /* metadata handler doesn't understand badblocks, - * so we need to fail the device - */ - md_error(rdev->mddev, rdev); - } clear_bit(Blocked, &rdev->flags); - clear_bit(BlockedBadBlocks, &rdev->flags); wake_up(&rdev->blocked_wait); set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); md_wakeup_thread(rdev->mddev->thread); @@ -2600,12 +2417,6 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len) } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { set_bit(In_sync, &rdev->flags); err = 0; - } else if (cmd_match(buf, "write_error")) { - set_bit(WriteErrorSeen, &rdev->flags); - err = 0; - } else if (cmd_match(buf, "-write_error")) { - clear_bit(WriteErrorSeen, &rdev->flags); - err = 0; } if (!err) sysfs_notify_dirent_safe(rdev->sysfs_state); @@ -2648,6 +2459,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) { char *e; int err; + char nm[20]; int slot = simple_strtoul(buf, &e, 10); if (strncmp(buf, "none", 4)==0) slot = -1; @@ -2670,7 +2482,8 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) hot_remove_disk(rdev->mddev, rdev->raid_disk); if (err) return err; - sysfs_unlink_rdev(rdev->mddev, rdev); + sprintf(nm, "rd%d", rdev->raid_disk); + sysfs_remove_link(&rdev->mddev->kobj, nm); rdev->raid_disk = -1; set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); md_wakeup_thread(rdev->mddev->thread); @@ -2709,7 +2522,8 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) return err; } else sysfs_notify_dirent_safe(rdev->sysfs_state); - if (sysfs_link_rdev(rdev->mddev, rdev)) + sprintf(nm, "rd%d", rdev->raid_disk); + if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm)) /* failure here is OK */; /* don't wakeup anyone, leave that to userspace. 
*/ } else { @@ -2898,39 +2712,6 @@ static ssize_t recovery_start_store(mdk_rdev_t *rdev, const char *buf, size_t le static struct rdev_sysfs_entry rdev_recovery_start = __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store); - -static ssize_t -badblocks_show(struct badblocks *bb, char *page, int unack); -static ssize_t -badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack); - -static ssize_t bb_show(mdk_rdev_t *rdev, char *page) -{ - return badblocks_show(&rdev->badblocks, page, 0); -} -static ssize_t bb_store(mdk_rdev_t *rdev, const char *page, size_t len) -{ - int rv = badblocks_store(&rdev->badblocks, page, len, 0); - /* Maybe that ack was all we needed */ - if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags)) - wake_up(&rdev->blocked_wait); - return rv; -} -static struct rdev_sysfs_entry rdev_bad_blocks = -__ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store); - - -static ssize_t ubb_show(mdk_rdev_t *rdev, char *page) -{ - return badblocks_show(&rdev->badblocks, page, 1); -} -static ssize_t ubb_store(mdk_rdev_t *rdev, const char *page, size_t len) -{ - return badblocks_store(&rdev->badblocks, page, len, 1); -} -static struct rdev_sysfs_entry rdev_unack_bad_blocks = -__ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store); - static struct attribute *rdev_default_attrs[] = { &rdev_state.attr, &rdev_errors.attr, @@ -2938,8 +2719,6 @@ static struct attribute *rdev_default_attrs[] = { &rdev_offset.attr, &rdev_size.attr, &rdev_recovery_start.attr, - &rdev_bad_blocks.attr, - &rdev_unack_bad_blocks.attr, NULL, }; static ssize_t @@ -3003,7 +2782,7 @@ static struct kobj_type rdev_ktype = { .default_attrs = rdev_default_attrs, }; -int md_rdev_init(mdk_rdev_t *rdev) +void md_rdev_init(mdk_rdev_t *rdev) { rdev->desc_nr = -1; rdev->saved_raid_disk = -1; @@ -3013,27 +2792,12 @@ int md_rdev_init(mdk_rdev_t *rdev) rdev->sb_events = 0; rdev->last_read_error.tv_sec = 0; rdev->last_read_error.tv_nsec = 0; - rdev->sb_loaded = 0; - rdev->bb_page = NULL; atomic_set(&rdev->nr_pending, 0); atomic_set(&rdev->read_errors, 0); atomic_set(&rdev->corrected_errors, 0); INIT_LIST_HEAD(&rdev->same_set); init_waitqueue_head(&rdev->blocked_wait); - - /* Add space to store bad block list. 
- * This reserves the space even on arrays where it cannot - * be used - I wonder if that matters - */ - rdev->badblocks.count = 0; - rdev->badblocks.shift = 0; - rdev->badblocks.page = kmalloc(PAGE_SIZE, GFP_KERNEL); - seqlock_init(&rdev->badblocks.lock); - if (rdev->badblocks.page == NULL) - return -ENOMEM; - - return 0; } EXPORT_SYMBOL_GPL(md_rdev_init); /* @@ -3059,11 +2823,8 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi return ERR_PTR(-ENOMEM); } - err = md_rdev_init(rdev); - if (err) - goto abort_free; - err = alloc_disk_sb(rdev); - if (err) + md_rdev_init(rdev); + if ((err = alloc_disk_sb(rdev))) goto abort_free; err = lock_rdev(rdev, newdev, super_format == -2); @@ -3099,17 +2860,15 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi goto abort_free; } } - if (super_format == -1) - /* hot-add for 0.90, or non-persistent: so no badblocks */ - rdev->badblocks.shift = -1; return rdev; abort_free: - if (rdev->bdev) - unlock_rdev(rdev); - free_disk_sb(rdev); - kfree(rdev->badblocks.page); + if (rdev->sb_page) { + if (rdev->bdev) + unlock_rdev(rdev); + free_disk_sb(rdev); + } kfree(rdev); return ERR_PTR(err); } @@ -3390,13 +3149,15 @@ level_store(mddev_t *mddev, const char *buf, size_t len) } list_for_each_entry(rdev, &mddev->disks, same_set) { + char nm[20]; if (rdev->raid_disk < 0) continue; if (rdev->new_raid_disk >= mddev->raid_disks) rdev->new_raid_disk = -1; if (rdev->new_raid_disk == rdev->raid_disk) continue; - sysfs_unlink_rdev(mddev, rdev); + sprintf(nm, "rd%d", rdev->raid_disk); + sysfs_remove_link(&mddev->kobj, nm); } list_for_each_entry(rdev, &mddev->disks, same_set) { if (rdev->raid_disk < 0) @@ -3407,10 +3168,11 @@ level_store(mddev_t *mddev, const char *buf, size_t len) if (rdev->raid_disk < 0) clear_bit(In_sync, &rdev->flags); else { - if (sysfs_link_rdev(mddev, rdev)) - printk(KERN_WARNING "md: cannot register rd%d" - " for %s after level change\n", - rdev->raid_disk, mdname(mddev)); + char nm[20]; + sprintf(nm, "rd%d", rdev->raid_disk); + if(sysfs_create_link(&mddev->kobj, &rdev->kobj, nm)) + printk("md: cannot register %s for %s after level change\n", + nm, mdname(mddev)); } } @@ -4742,8 +4504,7 @@ int md_run(mddev_t *mddev) } if (mddev->bio_set == NULL) - mddev->bio_set = bioset_create(BIO_POOL_SIZE, - sizeof(mddev_t *)); + mddev->bio_set = bioset_create(BIO_POOL_SIZE, sizeof(mddev)); spin_lock(&pers_lock); pers = find_pers(mddev->level, mddev->clevel); @@ -4860,9 +4621,12 @@ int md_run(mddev_t *mddev) smp_wmb(); mddev->ready = 1; list_for_each_entry(rdev, &mddev->disks, same_set) - if (rdev->raid_disk >= 0) - if (sysfs_link_rdev(mddev, rdev)) + if (rdev->raid_disk >= 0) { + char nm[20]; + sprintf(nm, "rd%d", rdev->raid_disk); + if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm)) /* failure here is OK */; + } set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); @@ -5090,8 +4854,11 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) sysfs_notify_dirent_safe(mddev->sysfs_state); list_for_each_entry(rdev, &mddev->disks, same_set) - if (rdev->raid_disk >= 0) - sysfs_unlink_rdev(mddev, rdev); + if (rdev->raid_disk >= 0) { + char nm[20]; + sprintf(nm, "rd%d", rdev->raid_disk); + sysfs_remove_link(&mddev->kobj, nm); + } set_capacity(disk, 0); mutex_unlock(&mddev->open_mutex); @@ -6431,7 +6198,18 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev) if (!rdev || test_bit(Faulty, &rdev->flags)) return; - if (!mddev->pers || !mddev->pers->error_handler) + if (mddev->external) + set_bit(Blocked, 
&rdev->flags); +/* + dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", + mdname(mddev), + MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev), + __builtin_return_address(0),__builtin_return_address(1), + __builtin_return_address(2),__builtin_return_address(3)); +*/ + if (!mddev->pers) + return; + if (!mddev->pers->error_handler) return; mddev->pers->error_handler(mddev,rdev); if (mddev->degraded) @@ -7155,14 +6933,11 @@ void md_do_sync(mddev_t *mddev) atomic_add(sectors, &mddev->recovery_active); } - if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) - break; - j += sectors; if (j>1) mddev->curr_resync = j; mddev->curr_mark_cnt = io_sectors; if (last_check == 0) - /* this is the earliest that rebuild will be + /* this is the earliers that rebuilt will be * visible in /proc/mdstat */ md_new_event(mddev); @@ -7171,6 +6946,10 @@ void md_do_sync(mddev_t *mddev) continue; last_check = io_sectors; + + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) + break; + repeat: if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { /* step marks */ @@ -7288,23 +7067,29 @@ static int remove_and_add_spares(mddev_t *mddev) atomic_read(&rdev->nr_pending)==0) { if (mddev->pers->hot_remove_disk( mddev, rdev->raid_disk)==0) { - sysfs_unlink_rdev(mddev, rdev); + char nm[20]; + sprintf(nm,"rd%d", rdev->raid_disk); + sysfs_remove_link(&mddev->kobj, nm); rdev->raid_disk = -1; } } - if (mddev->degraded) { + if (mddev->degraded && !mddev->recovery_disabled) { list_for_each_entry(rdev, &mddev->disks, same_set) { if (rdev->raid_disk >= 0 && !test_bit(In_sync, &rdev->flags) && - !test_bit(Faulty, &rdev->flags)) + !test_bit(Faulty, &rdev->flags) && + !test_bit(Blocked, &rdev->flags)) spares++; if (rdev->raid_disk < 0 && !test_bit(Faulty, &rdev->flags)) { rdev->recovery_offset = 0; if (mddev->pers-> hot_add_disk(mddev, rdev) == 0) { - if (sysfs_link_rdev(mddev, rdev)) + char nm[20]; + sprintf(nm, "rd%d", rdev->raid_disk); + if (sysfs_create_link(&mddev->kobj, + &rdev->kobj, nm)) /* failure here is OK */; spares++; md_new_event(mddev); @@ -7353,8 +7138,6 @@ static void reap_sync_thread(mddev_t *mddev) set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); sysfs_notify_dirent_safe(mddev->sysfs_action); md_new_event(mddev); - if (mddev->event_work.func) - queue_work(md_misc_wq, &mddev->event_work); } /* @@ -7387,6 +7170,9 @@ void md_check_recovery(mddev_t *mddev) if (mddev->bitmap) bitmap_daemon_work(mddev); + if (mddev->ro) + return; + if (signal_pending(current)) { if (mddev->pers->sync_request && !mddev->external) { printk(KERN_INFO "md: %s in immediate safe mode\n", @@ -7423,7 +7209,9 @@ void md_check_recovery(mddev_t *mddev) atomic_read(&rdev->nr_pending)==0) { if (mddev->pers->hot_remove_disk( mddev, rdev->raid_disk)==0) { - sysfs_unlink_rdev(mddev, rdev); + char nm[20]; + sprintf(nm,"rd%d", rdev->raid_disk); + sysfs_remove_link(&mddev->kobj, nm); rdev->raid_disk = -1; } } @@ -7543,499 +7331,12 @@ void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev) { sysfs_notify_dirent_safe(rdev->sysfs_state); wait_event_timeout(rdev->blocked_wait, - !test_bit(Blocked, &rdev->flags) && - !test_bit(BlockedBadBlocks, &rdev->flags), + !test_bit(Blocked, &rdev->flags), msecs_to_jiffies(5000)); rdev_dec_pending(rdev, mddev); } EXPORT_SYMBOL(md_wait_for_blocked_rdev); - -/* Bad block management. - * We can record which blocks on each device are 'bad' and so just - * fail those blocks, or that stripe, rather than the whole device. - * Entries in the bad-block table are 64bits wide. 
This comprises: - * Length of bad-range, in sectors: 0-511 for lengths 1-512 - * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes) - * A 'shift' can be set so that larger blocks are tracked and - * consequently larger devices can be covered. - * 'Acknowledged' flag - 1 bit. - the most significant bit. - * - * Locking of the bad-block table uses a seqlock so md_is_badblock - * might need to retry if it is very unlucky. - * We will sometimes want to check for bad blocks in a bi_end_io function, - * so we use the write_seqlock_irq variant. - * - * When looking for a bad block we specify a range and want to - * know if any block in the range is bad. So we binary-search - * to the last range that starts at-or-before the given endpoint, - * (or "before the sector after the target range") - * then see if it ends after the given start. - * We return - * 0 if there are no known bad blocks in the range - * 1 if there are known bad block which are all acknowledged - * -1 if there are bad blocks which have not yet been acknowledged in metadata. - * plus the start/length of the first bad section we overlap. - */ -int md_is_badblock(struct badblocks *bb, sector_t s, int sectors, - sector_t *first_bad, int *bad_sectors) -{ - int hi; - int lo = 0; - u64 *p = bb->page; - int rv = 0; - sector_t target = s + sectors; - unsigned seq; - - if (bb->shift > 0) { - /* round the start down, and the end up */ - s >>= bb->shift; - target += (1<shift) - 1; - target >>= bb->shift; - sectors = target - s; - } - /* 'target' is now the first block after the bad range */ - -retry: - seq = read_seqbegin(&bb->lock); - - hi = bb->count; - - /* Binary search between lo and hi for 'target' - * i.e. for the last range that starts before 'target' - */ - /* INVARIANT: ranges before 'lo' and at-or-after 'hi' - * are known not to be the last range before target. - * VARIANT: hi-lo is the number of possible - * ranges, and decreases until it reaches 1 - */ - while (hi - lo > 1) { - int mid = (lo + hi) / 2; - sector_t a = BB_OFFSET(p[mid]); - if (a < target) - /* This could still be the one, earlier ranges - * could not. */ - lo = mid; - else - /* This and later ranges are definitely out. */ - hi = mid; - } - /* 'lo' might be the last that started before target, but 'hi' isn't */ - if (hi > lo) { - /* need to check all range that end after 's' to see if - * any are unacknowledged. - */ - while (lo >= 0 && - BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { - if (BB_OFFSET(p[lo]) < target) { - /* starts before the end, and finishes after - * the start, so they must overlap - */ - if (rv != -1 && BB_ACK(p[lo])) - rv = 1; - else - rv = -1; - *first_bad = BB_OFFSET(p[lo]); - *bad_sectors = BB_LEN(p[lo]); - } - lo--; - } - } - - if (read_seqretry(&bb->lock, seq)) - goto retry; - - return rv; -} -EXPORT_SYMBOL_GPL(md_is_badblock); - -/* - * Add a range of bad blocks to the table. - * This might extend the table, or might contract it - * if two adjacent ranges can be merged. - * We binary-search to find the 'insertion' point, then - * decide how best to handle it. 
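/*
 * [Editor's note] A self-contained userspace model of the table layout and
 * lookup described in the removed comment above: each entry packs length-1
 * into bits 0-8, the start sector into bits 9-62, and the 'acknowledged'
 * flag into bit 63 (the BB_* encoding from md.h, also removed by this
 * patch), and lookup binary-searches for the last range starting before
 * the end of the queried range, exactly as md_is_badblock() does.  The
 * shift handling, seqlock retry, and first_bad/bad_sectors outputs of the
 * real code are omitted.
 */
#include <stdio.h>
#include <stdint.h>

#define BB_OFFSET(x)    (((x) & 0x7FFFFFFFFFFFFE00ULL) >> 9)
#define BB_LEN(x)       (((x) & 0x1FFULL) + 1)
#define BB_ACK(x)       (!!((x) & 0x8000000000000000ULL))
#define BB_MAKE(a, l, ack) \
        (((uint64_t)(a) << 9) | ((l) - 1) | ((uint64_t)(!!(ack)) << 63))

/* Return 0 (no bad blocks in [s, s+sectors)), 1 (all overlaps acked),
 * or -1 (some overlap not yet acknowledged), per the contract above. */
static int is_badblock(const uint64_t *p, int count, uint64_t s, int sectors)
{
        uint64_t target = s + sectors;  /* first block after the range */
        int lo = 0, hi = count, rv = 0;

        /* binary search for the last entry starting before 'target' */
        while (hi - lo > 1) {
                int mid = (lo + hi) / 2;
                if (BB_OFFSET(p[mid]) < target)
                        lo = mid;
                else
                        hi = mid;
        }
        /* walk back over every entry that ends after 's' */
        if (hi > lo)
                for (; lo >= 0 && BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s; lo--)
                        if (BB_OFFSET(p[lo]) < target)
                                rv = (rv != -1 && BB_ACK(p[lo])) ? 1 : -1;
        return rv;
}

int main(void)
{
        /* sorted table: [100,108) acknowledged, [500,516) not yet acked */
        uint64_t bb[] = { BB_MAKE(100, 8, 1), BB_MAKE(500, 16, 0) };

        printf("%d\n", is_badblock(bb, 2, 0, 50));      /* 0: clean range */
        printf("%d\n", is_badblock(bb, 2, 96, 8));      /* 1: acked overlap */
        printf("%d\n", is_badblock(bb, 2, 510, 4));     /* -1: unacked overlap */
        return 0;
}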
- */ -static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors, - int acknowledged) -{ - u64 *p; - int lo, hi; - int rv = 1; - - if (bb->shift < 0) - /* badblocks are disabled */ - return 0; - - if (bb->shift) { - /* round the start down, and the end up */ - sector_t next = s + sectors; - s >>= bb->shift; - next += (1<shift) - 1; - next >>= bb->shift; - sectors = next - s; - } - - write_seqlock_irq(&bb->lock); - - p = bb->page; - lo = 0; - hi = bb->count; - /* Find the last range that starts at-or-before 's' */ - while (hi - lo > 1) { - int mid = (lo + hi) / 2; - sector_t a = BB_OFFSET(p[mid]); - if (a <= s) - lo = mid; - else - hi = mid; - } - if (hi > lo && BB_OFFSET(p[lo]) > s) - hi = lo; - - if (hi > lo) { - /* we found a range that might merge with the start - * of our new range - */ - sector_t a = BB_OFFSET(p[lo]); - sector_t e = a + BB_LEN(p[lo]); - int ack = BB_ACK(p[lo]); - if (e >= s) { - /* Yes, we can merge with a previous range */ - if (s == a && s + sectors >= e) - /* new range covers old */ - ack = acknowledged; - else - ack = ack && acknowledged; - - if (e < s + sectors) - e = s + sectors; - if (e - a <= BB_MAX_LEN) { - p[lo] = BB_MAKE(a, e-a, ack); - s = e; - } else { - /* does not all fit in one range, - * make p[lo] maximal - */ - if (BB_LEN(p[lo]) != BB_MAX_LEN) - p[lo] = BB_MAKE(a, BB_MAX_LEN, ack); - s = a + BB_MAX_LEN; - } - sectors = e - s; - } - } - if (sectors && hi < bb->count) { - /* 'hi' points to the first range that starts after 's'. - * Maybe we can merge with the start of that range */ - sector_t a = BB_OFFSET(p[hi]); - sector_t e = a + BB_LEN(p[hi]); - int ack = BB_ACK(p[hi]); - if (a <= s + sectors) { - /* merging is possible */ - if (e <= s + sectors) { - /* full overlap */ - e = s + sectors; - ack = acknowledged; - } else - ack = ack && acknowledged; - - a = s; - if (e - a <= BB_MAX_LEN) { - p[hi] = BB_MAKE(a, e-a, ack); - s = e; - } else { - p[hi] = BB_MAKE(a, BB_MAX_LEN, ack); - s = a + BB_MAX_LEN; - } - sectors = e - s; - lo = hi; - hi++; - } - } - if (sectors == 0 && hi < bb->count) { - /* we might be able to combine lo and hi */ - /* Note: 's' is at the end of 'lo' */ - sector_t a = BB_OFFSET(p[hi]); - int lolen = BB_LEN(p[lo]); - int hilen = BB_LEN(p[hi]); - int newlen = lolen + hilen - (s - a); - if (s >= a && newlen < BB_MAX_LEN) { - /* yes, we can combine them */ - int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]); - p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack); - memmove(p + hi, p + hi + 1, - (bb->count - hi - 1) * 8); - bb->count--; - } - } - while (sectors) { - /* didn't merge (it all). 
- * Need to add a range just before 'hi' */ - if (bb->count >= MD_MAX_BADBLOCKS) { - /* No room for more */ - rv = 0; - break; - } else { - int this_sectors = sectors; - memmove(p + hi + 1, p + hi, - (bb->count - hi) * 8); - bb->count++; - - if (this_sectors > BB_MAX_LEN) - this_sectors = BB_MAX_LEN; - p[hi] = BB_MAKE(s, this_sectors, acknowledged); - sectors -= this_sectors; - s += this_sectors; - } - } - - bb->changed = 1; - if (!acknowledged) - bb->unacked_exist = 1; - write_sequnlock_irq(&bb->lock); - - return rv; -} - -int rdev_set_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors, - int acknowledged) -{ - int rv = md_set_badblocks(&rdev->badblocks, - s + rdev->data_offset, sectors, acknowledged); - if (rv) { - /* Make sure they get written out promptly */ - set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags); - md_wakeup_thread(rdev->mddev->thread); - } - return rv; -} -EXPORT_SYMBOL_GPL(rdev_set_badblocks); - -/* - * Remove a range of bad blocks from the table. - * This may involve extending the table if we spilt a region, - * but it must not fail. So if the table becomes full, we just - * drop the remove request. - */ -static int md_clear_badblocks(struct badblocks *bb, sector_t s, int sectors) -{ - u64 *p; - int lo, hi; - sector_t target = s + sectors; - int rv = 0; - - if (bb->shift > 0) { - /* When clearing we round the start up and the end down. - * This should not matter as the shift should align with - * the block size and no rounding should ever be needed. - * However it is better the think a block is bad when it - * isn't than to think a block is not bad when it is. - */ - s += (1<shift) - 1; - s >>= bb->shift; - target >>= bb->shift; - sectors = target - s; - } - - write_seqlock_irq(&bb->lock); - - p = bb->page; - lo = 0; - hi = bb->count; - /* Find the last range that starts before 'target' */ - while (hi - lo > 1) { - int mid = (lo + hi) / 2; - sector_t a = BB_OFFSET(p[mid]); - if (a < target) - lo = mid; - else - hi = mid; - } - if (hi > lo) { - /* p[lo] is the last range that could overlap the - * current range. Earlier ranges could also overlap, - * but only this one can overlap the end of the range. - */ - if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) { - /* Partial overlap, leave the tail of this range */ - int ack = BB_ACK(p[lo]); - sector_t a = BB_OFFSET(p[lo]); - sector_t end = a + BB_LEN(p[lo]); - - if (a < s) { - /* we need to split this range */ - if (bb->count >= MD_MAX_BADBLOCKS) { - rv = 0; - goto out; - } - memmove(p+lo+1, p+lo, (bb->count - lo) * 8); - bb->count++; - p[lo] = BB_MAKE(a, s-a, ack); - lo++; - } - p[lo] = BB_MAKE(target, end - target, ack); - /* there is no longer an overlap */ - hi = lo; - lo--; - } - while (lo >= 0 && - BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { - /* This range does overlap */ - if (BB_OFFSET(p[lo]) < s) { - /* Keep the early parts of this range. */ - int ack = BB_ACK(p[lo]); - sector_t start = BB_OFFSET(p[lo]); - p[lo] = BB_MAKE(start, s - start, ack); - /* now low doesn't overlap, so.. 
*/ - break; - } - lo--; - } - /* 'lo' is strictly before, 'hi' is strictly after, - * anything between needs to be discarded - */ - if (hi - lo > 1) { - memmove(p+lo+1, p+hi, (bb->count - hi) * 8); - bb->count -= (hi - lo - 1); - } - } - - bb->changed = 1; -out: - write_sequnlock_irq(&bb->lock); - return rv; -} - -int rdev_clear_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors) -{ - return md_clear_badblocks(&rdev->badblocks, - s + rdev->data_offset, - sectors); -} -EXPORT_SYMBOL_GPL(rdev_clear_badblocks); - -/* - * Acknowledge all bad blocks in a list. - * This only succeeds if ->changed is clear. It is used by - * in-kernel metadata updates - */ -void md_ack_all_badblocks(struct badblocks *bb) -{ - if (bb->page == NULL || bb->changed) - /* no point even trying */ - return; - write_seqlock_irq(&bb->lock); - - if (bb->changed == 0) { - u64 *p = bb->page; - int i; - for (i = 0; i < bb->count ; i++) { - if (!BB_ACK(p[i])) { - sector_t start = BB_OFFSET(p[i]); - int len = BB_LEN(p[i]); - p[i] = BB_MAKE(start, len, 1); - } - } - bb->unacked_exist = 0; - } - write_sequnlock_irq(&bb->lock); -} -EXPORT_SYMBOL_GPL(md_ack_all_badblocks); - -/* sysfs access to bad-blocks list. - * We present two files. - * 'bad-blocks' lists sector numbers and lengths of ranges that - * are recorded as bad. The list is truncated to fit within - * the one-page limit of sysfs. - * Writing "sector length" to this file adds an acknowledged - * bad block list. - * 'unacknowledged-bad-blocks' lists bad blocks that have not yet - * been acknowledged. Writing to this file adds bad blocks - * without acknowledging them. This is largely for testing. - */ - -static ssize_t -badblocks_show(struct badblocks *bb, char *page, int unack) -{ - size_t len; - int i; - u64 *p = bb->page; - unsigned seq; - - if (bb->shift < 0) - return 0; - -retry: - seq = read_seqbegin(&bb->lock); - - len = 0; - i = 0; - - while (len < PAGE_SIZE && i < bb->count) { - sector_t s = BB_OFFSET(p[i]); - unsigned int length = BB_LEN(p[i]); - int ack = BB_ACK(p[i]); - i++; - - if (unack && ack) - continue; - - len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n", - (unsigned long long)s << bb->shift, - length << bb->shift); - } - if (unack && len == 0) - bb->unacked_exist = 0; - - if (read_seqretry(&bb->lock, seq)) - goto retry; - - return len; -} - -#define DO_DEBUG 1 - -static ssize_t -badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack) -{ - unsigned long long sector; - int length; - char newline; -#ifdef DO_DEBUG - /* Allow clearing via sysfs *only* for testing/debugging. - * Normally only a successful write may clear a badblock - */ - int clear = 0; - if (page[0] == '-') { - clear = 1; - page++; - } -#endif /* DO_DEBUG */ - - switch (sscanf(page, "%llu %d%c", §or, &length, &newline)) { - case 3: - if (newline != '\n') - return -EINVAL; - case 2: - if (length <= 0) - return -EINVAL; - break; - default: - return -EINVAL; - } - -#ifdef DO_DEBUG - if (clear) { - md_clear_badblocks(bb, sector, length); - return len; - } -#endif /* DO_DEBUG */ - if (md_set_badblocks(bb, sector, length, !unack)) - return len; - else - return -ENOSPC; -} - static int md_notify_reboot(struct notifier_block *this, unsigned long code, void *x) { diff --git a/trunk/drivers/md/md.h b/trunk/drivers/md/md.h index 1e586bb4452e..1c26c7a08ae6 100644 --- a/trunk/drivers/md/md.h +++ b/trunk/drivers/md/md.h @@ -29,13 +29,6 @@ typedef struct mddev_s mddev_t; typedef struct mdk_rdev_s mdk_rdev_t; -/* Bad block numbers are stored sorted in a single page. 
- * 64bits is used for each block or extent. - * 54 bits are sector number, 9 bits are extent size, - * 1 bit is an 'acknowledged' flag. - */ -#define MD_MAX_BADBLOCKS (PAGE_SIZE/8) - /* * MD's 'extended' device */ @@ -55,7 +48,7 @@ struct mdk_rdev_s struct block_device *meta_bdev; struct block_device *bdev; /* block device handle */ - struct page *sb_page, *bb_page; + struct page *sb_page; int sb_loaded; __u64 sb_events; sector_t data_offset; /* start of data in array */ @@ -81,29 +74,9 @@ struct mdk_rdev_s #define In_sync 2 /* device is in_sync with rest of array */ #define WriteMostly 4 /* Avoid reading if at all possible */ #define AutoDetected 7 /* added by auto-detect */ -#define Blocked 8 /* An error occurred but has not yet - * been acknowledged by the metadata - * handler, so don't allow writes +#define Blocked 8 /* An error occurred on an externally + * managed array, don't allow writes * until it is cleared */ -#define WriteErrorSeen 9 /* A write error has been seen on this - * device - */ -#define FaultRecorded 10 /* Intermediate state for clearing - * Blocked. The Fault is/will-be - * recorded in the metadata, but that - * metadata hasn't been stored safely - * on disk yet. - */ -#define BlockedBadBlocks 11 /* A writer is blocked because they - * found an unacknowledged bad-block. - * This can safely be cleared at any - * time, and the writer will re-check. - * It may be set at any time, and at - * worst the writer will timeout and - * re-check. So setting it as - * accurately as possible is good, but - * not absolutely critical. - */ wait_queue_head_t blocked_wait; int desc_nr; /* descriptor index in the superblock */ @@ -138,54 +111,8 @@ struct mdk_rdev_s struct sysfs_dirent *sysfs_state; /* handle for 'state' * sysfs entry */ - - struct badblocks { - int count; /* count of bad blocks */ - int unacked_exist; /* there probably are unacknowledged - * bad blocks. 
This is only cleared - * when a read discovers none - */ - int shift; /* shift from sectors to block size - * a -ve shift means badblocks are - * disabled.*/ - u64 *page; /* badblock list */ - int changed; - seqlock_t lock; - - sector_t sector; - sector_t size; /* in sectors */ - } badblocks; }; -#define BB_LEN_MASK (0x00000000000001FFULL) -#define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL) -#define BB_ACK_MASK (0x8000000000000000ULL) -#define BB_MAX_LEN 512 -#define BB_OFFSET(x) (((x) & BB_OFFSET_MASK) >> 9) -#define BB_LEN(x) (((x) & BB_LEN_MASK) + 1) -#define BB_ACK(x) (!!((x) & BB_ACK_MASK)) -#define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63)) - -extern int md_is_badblock(struct badblocks *bb, sector_t s, int sectors, - sector_t *first_bad, int *bad_sectors); -static inline int is_badblock(mdk_rdev_t *rdev, sector_t s, int sectors, - sector_t *first_bad, int *bad_sectors) -{ - if (unlikely(rdev->badblocks.count)) { - int rv = md_is_badblock(&rdev->badblocks, rdev->data_offset + s, - sectors, - first_bad, bad_sectors); - if (rv) - *first_bad -= rdev->data_offset; - return rv; - } - return 0; -} -extern int rdev_set_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors, - int acknowledged); -extern int rdev_clear_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors); -extern void md_ack_all_badblocks(struct badblocks *bb); - struct mddev_s { void *private; @@ -312,12 +239,9 @@ struct mddev_s #define MD_RECOVERY_FROZEN 9 unsigned long recovery; - /* If a RAID personality determines that recovery (of a particular - * device) will fail due to a read error on the source device, it - * takes a copy of this number and does not attempt recovery again - * until this number changes. - */ - int recovery_disabled; + int recovery_disabled; /* if we detect that recovery + * will always fail, set this + * so we don't loop trying */ int in_sync; /* know to not need resync */ /* 'open_mutex' avoids races between 'md_open' and 'do_md_stop', so @@ -380,6 +304,11 @@ struct mddev_s * hot-adding a bitmap. It should * eventually be settable by sysfs. */ + /* When md is serving under dm, it might use a + * dirty_log to store the bits. + */ + struct dm_dirty_log *log; + struct mutex mutex; unsigned long chunksize; unsigned long daemon_sleep; /* how many jiffies between updates? */ @@ -484,20 +413,6 @@ static inline char * mdname (mddev_t * mddev) return mddev->gendisk ? mddev->gendisk->disk_name : "mdX"; } -static inline int sysfs_link_rdev(mddev_t *mddev, mdk_rdev_t *rdev) -{ - char nm[20]; - sprintf(nm, "rd%d", rdev->raid_disk); - return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); -} - -static inline void sysfs_unlink_rdev(mddev_t *mddev, mdk_rdev_t *rdev) -{ - char nm[20]; - sprintf(nm, "rd%d", rdev->raid_disk); - sysfs_remove_link(&mddev->kobj, nm); -} - /* * iterates through some rdev ringlist. It's safe to remove the * current 'rdev'. Dont touch 'tmp' though. 
@@ -590,7 +505,7 @@ extern void mddev_init(mddev_t *mddev); extern int md_run(mddev_t *mddev); extern void md_stop(mddev_t *mddev); extern void md_stop_writes(mddev_t *mddev); -extern int md_rdev_init(mdk_rdev_t *rdev); +extern void md_rdev_init(mdk_rdev_t *rdev); extern void mddev_suspend(mddev_t *mddev); extern void mddev_resume(mddev_t *mddev); @@ -599,5 +514,4 @@ extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, mddev_t *mddev); extern int mddev_check_plugged(mddev_t *mddev); -extern void md_trim_bio(struct bio *bio, int offset, int size); #endif /* _MD_MD_H */ diff --git a/trunk/drivers/md/raid1.c b/trunk/drivers/md/raid1.c index 32323f0afd89..f7431b6d8447 100644 --- a/trunk/drivers/md/raid1.c +++ b/trunk/drivers/md/raid1.c @@ -35,13 +35,16 @@ #include #include #include -#include #include "md.h" #include "raid1.h" #include "bitmap.h" #define DEBUG 0 -#define PRINTK(x...) do { if (DEBUG) printk(x); } while (0) +#if DEBUG +#define PRINTK(x...) printk(x) +#else +#define PRINTK(x...) +#endif /* * Number of guaranteed r1bios in case of extreme VM load: @@ -163,7 +166,7 @@ static void put_all_bios(conf_t *conf, r1bio_t *r1_bio) for (i = 0; i < conf->raid_disks; i++) { struct bio **bio = r1_bio->bios + i; - if (!BIO_SPECIAL(*bio)) + if (*bio && *bio != IO_BLOCKED) bio_put(*bio); *bio = NULL; } @@ -173,6 +176,12 @@ static void free_r1bio(r1bio_t *r1_bio) { conf_t *conf = r1_bio->mddev->private; + /* + * Wake up any possible resync thread that waits for the device + * to go idle. + */ + allow_barrier(conf); + put_all_bios(conf, r1_bio); mempool_free(r1_bio, conf->r1bio_pool); } @@ -213,33 +222,6 @@ static void reschedule_retry(r1bio_t *r1_bio) * operation and are ready to return a success/failure code to the buffer * cache layer. */ -static void call_bio_endio(r1bio_t *r1_bio) -{ - struct bio *bio = r1_bio->master_bio; - int done; - conf_t *conf = r1_bio->mddev->private; - - if (bio->bi_phys_segments) { - unsigned long flags; - spin_lock_irqsave(&conf->device_lock, flags); - bio->bi_phys_segments--; - done = (bio->bi_phys_segments == 0); - spin_unlock_irqrestore(&conf->device_lock, flags); - } else - done = 1; - - if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) - clear_bit(BIO_UPTODATE, &bio->bi_flags); - if (done) { - bio_endio(bio, 0); - /* - * Wake up any possible resync thread that waits for the device - * to go idle. - */ - allow_barrier(conf); - } -} - static void raid_end_bio_io(r1bio_t *r1_bio) { struct bio *bio = r1_bio->master_bio; @@ -252,7 +234,8 @@ static void raid_end_bio_io(r1bio_t *r1_bio) (unsigned long long) bio->bi_sector + (bio->bi_size >> 9) - 1); - call_bio_endio(r1_bio); + bio_endio(bio, + test_bit(R1BIO_Uptodate, &r1_bio->state) ? 
0 : -EIO); } free_r1bio(r1_bio); } @@ -304,52 +287,36 @@ static void raid1_end_read_request(struct bio *bio, int error) * oops, read error: */ char b[BDEVNAME_SIZE]; - printk_ratelimited( - KERN_ERR "md/raid1:%s: %s: " - "rescheduling sector %llu\n", - mdname(conf->mddev), - bdevname(conf->mirrors[mirror].rdev->bdev, - b), - (unsigned long long)r1_bio->sector); - set_bit(R1BIO_ReadError, &r1_bio->state); + if (printk_ratelimit()) + printk(KERN_ERR "md/raid1:%s: %s: rescheduling sector %llu\n", + mdname(conf->mddev), + bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector); reschedule_retry(r1_bio); } rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); } -static void close_write(r1bio_t *r1_bio) -{ - /* it really is the end of this request */ - if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { - /* free extra copy of the data pages */ - int i = r1_bio->behind_page_count; - while (i--) - safe_put_page(r1_bio->behind_bvecs[i].bv_page); - kfree(r1_bio->behind_bvecs); - r1_bio->behind_bvecs = NULL; - } - /* clear the bitmap if all writes complete successfully */ - bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, - r1_bio->sectors, - !test_bit(R1BIO_Degraded, &r1_bio->state), - test_bit(R1BIO_BehindIO, &r1_bio->state)); - md_write_end(r1_bio->mddev); -} - static void r1_bio_write_done(r1bio_t *r1_bio) { - if (!atomic_dec_and_test(&r1_bio->remaining)) - return; - - if (test_bit(R1BIO_WriteError, &r1_bio->state)) - reschedule_retry(r1_bio); - else { - close_write(r1_bio); - if (test_bit(R1BIO_MadeGood, &r1_bio->state)) - reschedule_retry(r1_bio); - else - raid_end_bio_io(r1_bio); + if (atomic_dec_and_test(&r1_bio->remaining)) + { + /* it really is the end of this request */ + if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { + /* free extra copy of the data pages */ + int i = r1_bio->behind_page_count; + while (i--) + safe_put_page(r1_bio->behind_pages[i]); + kfree(r1_bio->behind_pages); + r1_bio->behind_pages = NULL; + } + /* clear the bitmap if all writes complete successfully */ + bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, + r1_bio->sectors, + !test_bit(R1BIO_Degraded, &r1_bio->state), + test_bit(R1BIO_BehindIO, &r1_bio->state)); + md_write_end(r1_bio->mddev); + raid_end_bio_io(r1_bio); } } @@ -369,11 +336,13 @@ static void raid1_end_write_request(struct bio *bio, int error) /* * 'one mirror IO has finished' event handler: */ + r1_bio->bios[mirror] = NULL; + to_put = bio; if (!uptodate) { - set_bit(WriteErrorSeen, - &conf->mirrors[mirror].rdev->flags); - set_bit(R1BIO_WriteError, &r1_bio->state); - } else { + md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); + /* an I/O failed, we can't clear the bitmap */ + set_bit(R1BIO_Degraded, &r1_bio->state); + } else /* * Set R1BIO_Uptodate in our master bio, so that we * will return a good error code for to the higher @@ -384,22 +353,8 @@ static void raid1_end_write_request(struct bio *bio, int error) * to user-side. So if something waits for IO, then it * will wait for the 'master' bio. */ - sector_t first_bad; - int bad_sectors; - - r1_bio->bios[mirror] = NULL; - to_put = bio; set_bit(R1BIO_Uptodate, &r1_bio->state); - /* Maybe we can clear some bad blocks. 
*/ - if (is_badblock(conf->mirrors[mirror].rdev, - r1_bio->sector, r1_bio->sectors, - &first_bad, &bad_sectors)) { - r1_bio->bios[mirror] = IO_MADE_GOOD; - set_bit(R1BIO_MadeGood, &r1_bio->state); - } - } - update_head_pos(mirror, r1_bio); if (behind) { @@ -422,13 +377,11 @@ static void raid1_end_write_request(struct bio *bio, int error) (unsigned long long) mbio->bi_sector, (unsigned long long) mbio->bi_sector + (mbio->bi_size >> 9) - 1); - call_bio_endio(r1_bio); + bio_endio(mbio, 0); } } } - if (r1_bio->bios[mirror] == NULL) - rdev_dec_pending(conf->mirrors[mirror].rdev, - conf->mddev); + rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); /* * Let's see if all mirrored write operations have finished @@ -455,11 +408,10 @@ static void raid1_end_write_request(struct bio *bio, int error) * * The rdev for the device selected will have nr_pending incremented. */ -static int read_balance(conf_t *conf, r1bio_t *r1_bio, int *max_sectors) +static int read_balance(conf_t *conf, r1bio_t *r1_bio) { const sector_t this_sector = r1_bio->sector; - int sectors; - int best_good_sectors; + const int sectors = r1_bio->sectors; int start_disk; int best_disk; int i; @@ -474,11 +426,8 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio, int *max_sectors) * We take the first readable disk when above the resync window. */ retry: - sectors = r1_bio->sectors; best_disk = -1; best_dist = MaxSector; - best_good_sectors = 0; - if (conf->mddev->recovery_cp < MaxSector && (this_sector + sectors >= conf->next_resync)) { choose_first = 1; @@ -490,9 +439,6 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio, int *max_sectors) for (i = 0 ; i < conf->raid_disks ; i++) { sector_t dist; - sector_t first_bad; - int bad_sectors; - int disk = start_disk + i; if (disk >= conf->raid_disks) disk -= conf->raid_disks; @@ -515,35 +461,6 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio, int *max_sectors) /* This is a reasonable device to use. It might * even be best. */ - if (is_badblock(rdev, this_sector, sectors, - &first_bad, &bad_sectors)) { - if (best_dist < MaxSector) - /* already have a better device */ - continue; - if (first_bad <= this_sector) { - /* cannot read here. If this is the 'primary' - * device, then we must not read beyond - * bad_sectors from another device.. 
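With the bad-block ranges gone, the selection logic that read_balance() keeps here reduces to a seek-distance heuristic: stay on the same disk for sequential reads, otherwise take the in-sync mirror whose last recorded head position is closest to the target sector. A rough user-space rendering of just that selection step (the struct and names are simplified stand-ins, not the kernel's):

#include <stdio.h>

typedef unsigned long long sector_t;

struct mirror {
        sector_t head_position; /* where the last I/O left this disk's head */
        int in_sync;            /* eligible to service reads at all */
};

/* pick the eligible mirror with the smallest seek distance */
static int pick_mirror(const struct mirror *m, int disks, sector_t target)
{
        sector_t best_dist = ~0ULL;
        int best = -1, i;

        for (i = 0; i < disks; i++) {
                sector_t dist;

                if (!m[i].in_sync)
                        continue;
                dist = m[i].head_position > target ?
                       m[i].head_position - target :
                       target - m[i].head_position;
                if (dist < best_dist) {
                        best_dist = dist;
                        best = i;
                }
        }
        return best;
}

int main(void)
{
        struct mirror m[2] = {
                { .head_position = 1000, .in_sync = 1 },
                { .head_position = 9000, .in_sync = 1 },
        };

        printf("sector 8500 -> mirror %d\n", pick_mirror(m, 2, 8500)); /* 1 */
        return 0;
}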
- */ - bad_sectors -= (this_sector - first_bad); - if (choose_first && sectors > bad_sectors) - sectors = bad_sectors; - if (best_good_sectors > sectors) - best_good_sectors = sectors; - - } else { - sector_t good_sectors = first_bad - this_sector; - if (good_sectors > best_good_sectors) { - best_good_sectors = good_sectors; - best_disk = disk; - } - if (choose_first) - break; - } - continue; - } else - best_good_sectors = sectors; - dist = abs(this_sector - conf->mirrors[disk].head_position); if (choose_first /* Don't change to another disk for sequential reads */ @@ -572,12 +489,10 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio, int *max_sectors) rdev_dec_pending(rdev, conf->mddev); goto retry; } - sectors = best_good_sectors; conf->next_seq_sect = this_sector + sectors; conf->last_used = best_disk; } rcu_read_unlock(); - *max_sectors = sectors; return best_disk; } @@ -757,31 +672,30 @@ static void alloc_behind_pages(struct bio *bio, r1bio_t *r1_bio) { int i; struct bio_vec *bvec; - struct bio_vec *bvecs = kzalloc(bio->bi_vcnt * sizeof(struct bio_vec), + struct page **pages = kzalloc(bio->bi_vcnt * sizeof(struct page*), GFP_NOIO); - if (unlikely(!bvecs)) + if (unlikely(!pages)) return; bio_for_each_segment(bvec, bio, i) { - bvecs[i] = *bvec; - bvecs[i].bv_page = alloc_page(GFP_NOIO); - if (unlikely(!bvecs[i].bv_page)) + pages[i] = alloc_page(GFP_NOIO); + if (unlikely(!pages[i])) goto do_sync_io; - memcpy(kmap(bvecs[i].bv_page) + bvec->bv_offset, - kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len); - kunmap(bvecs[i].bv_page); + memcpy(kmap(pages[i]) + bvec->bv_offset, + kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len); + kunmap(pages[i]); kunmap(bvec->bv_page); } - r1_bio->behind_bvecs = bvecs; + r1_bio->behind_pages = pages; r1_bio->behind_page_count = bio->bi_vcnt; set_bit(R1BIO_BehindIO, &r1_bio->state); return; do_sync_io: for (i = 0; i < bio->bi_vcnt; i++) - if (bvecs[i].bv_page) - put_page(bvecs[i].bv_page); - kfree(bvecs); + if (pages[i]) + put_page(pages[i]); + kfree(pages); PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size); } @@ -791,7 +705,7 @@ static int make_request(mddev_t *mddev, struct bio * bio) mirror_info_t *mirror; r1bio_t *r1_bio; struct bio *read_bio; - int i, disks; + int i, targets = 0, disks; struct bitmap *bitmap; unsigned long flags; const int rw = bio_data_dir(bio); @@ -799,9 +713,6 @@ static int make_request(mddev_t *mddev, struct bio * bio) const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA)); mdk_rdev_t *blocked_rdev; int plugged; - int first_clone; - int sectors_handled; - int max_sectors; /* * Register the new request and wait if the reconstruction @@ -848,24 +759,11 @@ static int make_request(mddev_t *mddev, struct bio * bio) r1_bio->mddev = mddev; r1_bio->sector = bio->bi_sector; - /* We might need to issue multiple reads to different - * devices if there are bad blocks around, so we keep - * track of the number of reads in bio->bi_phys_segments. - * If this is 0, there is only one r1_bio and no locking - * will be needed when requests complete. If it is - * non-zero, then it is the number of not-completed requests. 
- */ - bio->bi_phys_segments = 0; - clear_bit(BIO_SEG_VALID, &bio->bi_flags); - if (rw == READ) { /* * read balancing logic: */ - int rdisk; - -read_again: - rdisk = read_balance(conf, r1_bio, &max_sectors); + int rdisk = read_balance(conf, r1_bio); if (rdisk < 0) { /* couldn't find anywhere to read from */ @@ -886,8 +784,6 @@ static int make_request(mddev_t *mddev, struct bio * bio) r1_bio->read_disk = rdisk; read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); - md_trim_bio(read_bio, r1_bio->sector - bio->bi_sector, - max_sectors); r1_bio->bios[rdisk] = read_bio; @@ -897,52 +793,16 @@ static int make_request(mddev_t *mddev, struct bio * bio) read_bio->bi_rw = READ | do_sync; read_bio->bi_private = r1_bio; - if (max_sectors < r1_bio->sectors) { - /* could not read all from this device, so we will - * need another r1_bio. - */ - - sectors_handled = (r1_bio->sector + max_sectors - - bio->bi_sector); - r1_bio->sectors = max_sectors; - spin_lock_irq(&conf->device_lock); - if (bio->bi_phys_segments == 0) - bio->bi_phys_segments = 2; - else - bio->bi_phys_segments++; - spin_unlock_irq(&conf->device_lock); - /* Cannot call generic_make_request directly - * as that will be queued in __make_request - * and subsequent mempool_alloc might block waiting - * for it. So hand bio over to raid1d. - */ - reschedule_retry(r1_bio); - - r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); - - r1_bio->master_bio = bio; - r1_bio->sectors = (bio->bi_size >> 9) - sectors_handled; - r1_bio->state = 0; - r1_bio->mddev = mddev; - r1_bio->sector = bio->bi_sector + sectors_handled; - goto read_again; - } else - generic_make_request(read_bio); + generic_make_request(read_bio); return 0; } /* * WRITE: */ - /* first select target devices under rcu_lock and + /* first select target devices under spinlock and * inc refcount on their rdev. Record them by setting * bios[x] to bio - * If there are known/acknowledged bad blocks on any device on - * which we have seen a write error, we want to avoid writing those - * blocks. - * This potentially requires several writes to write around - * the bad blocks. Each set of writes gets it's own r1bio - * with a set of bios attached. 
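The bi_phys_segments bookkeeping removed above is worth restating, since the same scheme is deleted again on the write side further down: the field counts not-yet-completed sub-requests carved out of one master bio, zero meaning the request was never split, and the master completes only when the last sub-request drops the count. A minimal sketch of that counting discipline, using a pthread mutex where the kernel uses conf->device_lock (types and names are stand-ins):

#include <pthread.h>
#include <stdbool.h>

struct master_req {
        pthread_mutex_t lock;
        int remaining;  /* 0 = never split; N = sub-requests still pending */
        bool failed;    /* some sub-request reported an error */
};

/* returns true when the caller just completed the whole request */
static bool sub_request_done(struct master_req *m, bool ok)
{
        bool last;

        pthread_mutex_lock(&m->lock);
        if (!ok)
                m->failed = true;
        if (m->remaining)
                last = (--m->remaining == 0);
        else
                last = true;    /* unsplit request: first completion is last */
        pthread_mutex_unlock(&m->lock);
        return last;
}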
*/ plugged = mddev_check_plugged(mddev); @@ -950,7 +810,6 @@ static int make_request(mddev_t *mddev, struct bio * bio) retry_write: blocked_rdev = NULL; rcu_read_lock(); - max_sectors = r1_bio->sectors; for (i = 0; i < disks; i++) { mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { @@ -958,56 +817,17 @@ static int make_request(mddev_t *mddev, struct bio * bio) blocked_rdev = rdev; break; } - r1_bio->bios[i] = NULL; - if (!rdev || test_bit(Faulty, &rdev->flags)) { - set_bit(R1BIO_Degraded, &r1_bio->state); - continue; - } - - atomic_inc(&rdev->nr_pending); - if (test_bit(WriteErrorSeen, &rdev->flags)) { - sector_t first_bad; - int bad_sectors; - int is_bad; - - is_bad = is_badblock(rdev, r1_bio->sector, - max_sectors, - &first_bad, &bad_sectors); - if (is_bad < 0) { - /* mustn't write here until the bad block is - * acknowledged*/ - set_bit(BlockedBadBlocks, &rdev->flags); - blocked_rdev = rdev; - break; - } - if (is_bad && first_bad <= r1_bio->sector) { - /* Cannot write here at all */ - bad_sectors -= (r1_bio->sector - first_bad); - if (bad_sectors < max_sectors) - /* mustn't write more than bad_sectors - * to other devices yet - */ - max_sectors = bad_sectors; + if (rdev && !test_bit(Faulty, &rdev->flags)) { + atomic_inc(&rdev->nr_pending); + if (test_bit(Faulty, &rdev->flags)) { rdev_dec_pending(rdev, mddev); - /* We don't set R1BIO_Degraded as that - * only applies if the disk is - * missing, so it might be re-added, - * and we want to know to recover this - * chunk. - * In this case the device is here, - * and the fact that this chunk is not - * in-sync is recorded in the bad - * block log - */ - continue; - } - if (is_bad) { - int good_sectors = first_bad - r1_bio->sector; - if (good_sectors < max_sectors) - max_sectors = good_sectors; + r1_bio->bios[i] = NULL; + } else { + r1_bio->bios[i] = bio; + targets++; } - } - r1_bio->bios[i] = bio; + } else + r1_bio->bios[i] = NULL; } rcu_read_unlock(); @@ -1018,57 +838,51 @@ static int make_request(mddev_t *mddev, struct bio * bio) for (j = 0; j < i; j++) if (r1_bio->bios[j]) rdev_dec_pending(conf->mirrors[j].rdev, mddev); - r1_bio->state = 0; + allow_barrier(conf); md_wait_for_blocked_rdev(blocked_rdev, mddev); wait_barrier(conf); goto retry_write; } - if (max_sectors < r1_bio->sectors) { - /* We are splitting this write into multiple parts, so - * we need to prepare for allocating another r1_bio. - */ - r1_bio->sectors = max_sectors; - spin_lock_irq(&conf->device_lock); - if (bio->bi_phys_segments == 0) - bio->bi_phys_segments = 2; - else - bio->bi_phys_segments++; - spin_unlock_irq(&conf->device_lock); + BUG_ON(targets == 0); /* we never fail the last device */ + + if (targets < conf->raid_disks) { + /* array is degraded, we will not clear the bitmap + * on I/O completion (see raid1_end_write_request) */ + set_bit(R1BIO_Degraded, &r1_bio->state); } - sectors_handled = r1_bio->sector + max_sectors - bio->bi_sector; + + /* do behind I/O ? 
+ * Not if there are too many, or cannot allocate memory, + * or a reader on WriteMostly is waiting for behind writes + * to flush */ + if (bitmap && + (atomic_read(&bitmap->behind_writes) + < mddev->bitmap_info.max_write_behind) && + !waitqueue_active(&bitmap->behind_wait)) + alloc_behind_pages(bio, r1_bio); atomic_set(&r1_bio->remaining, 1); atomic_set(&r1_bio->behind_remaining, 0); - first_clone = 1; + bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors, + test_bit(R1BIO_BehindIO, &r1_bio->state)); for (i = 0; i < disks; i++) { struct bio *mbio; if (!r1_bio->bios[i]) continue; mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); - md_trim_bio(mbio, r1_bio->sector - bio->bi_sector, max_sectors); - - if (first_clone) { - /* do behind I/O ? - * Not if there are too many, or cannot - * allocate memory, or a reader on WriteMostly - * is waiting for behind writes to flush */ - if (bitmap && - (atomic_read(&bitmap->behind_writes) - < mddev->bitmap_info.max_write_behind) && - !waitqueue_active(&bitmap->behind_wait)) - alloc_behind_pages(mbio, r1_bio); - - bitmap_startwrite(bitmap, r1_bio->sector, - r1_bio->sectors, - test_bit(R1BIO_BehindIO, - &r1_bio->state)); - first_clone = 0; - } - if (r1_bio->behind_bvecs) { + r1_bio->bios[i] = mbio; + + mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; + mbio->bi_bdev = conf->mirrors[i].rdev->bdev; + mbio->bi_end_io = raid1_end_write_request; + mbio->bi_rw = WRITE | do_flush_fua | do_sync; + mbio->bi_private = r1_bio; + + if (r1_bio->behind_pages) { struct bio_vec *bvec; int j; @@ -1080,20 +894,11 @@ static int make_request(mddev_t *mddev, struct bio * bio) * them all */ __bio_for_each_segment(bvec, mbio, j, 0) - bvec->bv_page = r1_bio->behind_bvecs[j].bv_page; + bvec->bv_page = r1_bio->behind_pages[j]; if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags)) atomic_inc(&r1_bio->behind_remaining); } - r1_bio->bios[i] = mbio; - - mbio->bi_sector = (r1_bio->sector + - conf->mirrors[i].rdev->data_offset); - mbio->bi_bdev = conf->mirrors[i].rdev->bdev; - mbio->bi_end_io = raid1_end_write_request; - mbio->bi_rw = WRITE | do_flush_fua | do_sync; - mbio->bi_private = r1_bio; - atomic_inc(&r1_bio->remaining); spin_lock_irqsave(&conf->device_lock, flags); bio_list_add(&conf->pending_bio_list, mbio); @@ -1104,19 +909,6 @@ static int make_request(mddev_t *mddev, struct bio * bio) /* In case raid1d snuck in to freeze_array */ wake_up(&conf->wait_barrier); - if (sectors_handled < (bio->bi_size >> 9)) { - /* We need another r1_bio. It has already been counted - * in bio->bi_phys_segments - */ - r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); - r1_bio->master_bio = bio; - r1_bio->sectors = (bio->bi_size >> 9) - sectors_handled; - r1_bio->state = 0; - r1_bio->mddev = mddev; - r1_bio->sector = bio->bi_sector + sectors_handled; - goto retry_write; - } - if (do_sync || !bitmap || !plugged) md_wakeup_thread(mddev->thread); @@ -1160,10 +952,9 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) * However don't try a recovery from this drive as * it is very likely to fail. 
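The restored gate in front of alloc_behind_pages() combines exactly the three tests visible above: a write-intent bitmap must exist, the outstanding behind writes must be under bitmap_info.max_write_behind, and no WriteMostly reader may already be sitting on behind_wait waiting for those writes to drain. As a plain predicate (the struct is a stand-in collapsing the kernel's bitmap and mddev fields):

#include <stdbool.h>

struct behind_state {
        bool have_bitmap;       /* write-intent bitmap configured */
        int behind_writes;      /* behind writes currently in flight */
        int max_write_behind;   /* configured ceiling */
        bool reader_waiting;    /* a reader wants behind writes flushed */
};

static bool may_do_write_behind(const struct behind_state *s)
{
        return s->have_bitmap &&
               s->behind_writes < s->max_write_behind &&
               !s->reader_waiting;
}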
*/ - conf->recovery_disabled = mddev->recovery_disabled; + mddev->recovery_disabled = 1; return; } - set_bit(Blocked, &rdev->flags); if (test_and_clear_bit(In_sync, &rdev->flags)) { unsigned long flags; spin_lock_irqsave(&conf->device_lock, flags); @@ -1236,7 +1027,7 @@ static int raid1_spare_active(mddev_t *mddev) && !test_bit(Faulty, &rdev->flags) && !test_and_set_bit(In_sync, &rdev->flags)) { count++; - sysfs_notify_dirent_safe(rdev->sysfs_state); + sysfs_notify_dirent(rdev->sysfs_state); } } spin_lock_irqsave(&conf->device_lock, flags); @@ -1257,9 +1048,6 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) int first = 0; int last = mddev->raid_disks - 1; - if (mddev->recovery_disabled == conf->recovery_disabled) - return -EBUSY; - if (rdev->raid_disk >= 0) first = last = rdev->raid_disk; @@ -1315,7 +1103,7 @@ static int raid1_remove_disk(mddev_t *mddev, int number) * is not possible. */ if (!test_bit(Faulty, &rdev->flags) && - mddev->recovery_disabled != conf->recovery_disabled && + !mddev->recovery_disabled && mddev->degraded < conf->raid_disks) { err = -EBUSY; goto abort; @@ -1367,8 +1155,6 @@ static void end_sync_write(struct bio *bio, int error) conf_t *conf = mddev->private; int i; int mirror=0; - sector_t first_bad; - int bad_sectors; for (i = 0; i < conf->raid_disks; i++) if (r1_bio->bios[i] == bio) { @@ -1386,48 +1172,18 @@ static void end_sync_write(struct bio *bio, int error) s += sync_blocks; sectors_to_go -= sync_blocks; } while (sectors_to_go > 0); - set_bit(WriteErrorSeen, - &conf->mirrors[mirror].rdev->flags); - set_bit(R1BIO_WriteError, &r1_bio->state); - } else if (is_badblock(conf->mirrors[mirror].rdev, - r1_bio->sector, - r1_bio->sectors, - &first_bad, &bad_sectors) && - !is_badblock(conf->mirrors[r1_bio->read_disk].rdev, - r1_bio->sector, - r1_bio->sectors, - &first_bad, &bad_sectors) - ) - set_bit(R1BIO_MadeGood, &r1_bio->state); + md_error(mddev, conf->mirrors[mirror].rdev); + } update_head_pos(mirror, r1_bio); if (atomic_dec_and_test(&r1_bio->remaining)) { - int s = r1_bio->sectors; - if (test_bit(R1BIO_MadeGood, &r1_bio->state) || - test_bit(R1BIO_WriteError, &r1_bio->state)) - reschedule_retry(r1_bio); - else { - put_buf(r1_bio); - md_done_sync(mddev, s, uptodate); - } + sector_t s = r1_bio->sectors; + put_buf(r1_bio); + md_done_sync(mddev, s, uptodate); } } -static int r1_sync_page_io(mdk_rdev_t *rdev, sector_t sector, - int sectors, struct page *page, int rw) -{ - if (sync_page_io(rdev, sector, sectors << 9, page, rw, false)) - /* success */ - return 1; - if (rw == WRITE) - set_bit(WriteErrorSeen, &rdev->flags); - /* need to record an error - either for the block or the device */ - if (!rdev_set_badblocks(rdev, sector, sectors, 0)) - md_error(rdev->mddev, rdev); - return 0; -} - static int fix_sync_read_error(r1bio_t *r1_bio) { /* Try some synchronous reads of other devices to get @@ -1437,9 +1193,6 @@ static int fix_sync_read_error(r1bio_t *r1_bio) * We don't need to freeze the array, because being in an * active sync request, there is no normal IO, and * no overlapping syncs. - * We don't need to check is_badblock() again as we - * made sure that anything with a bad block in range - * will have bi_end_io clear. 
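For context on what disappears with r1_sync_page_io(): it centralised the escalation policy for repair I/O, namely try the transfer, remember write errors on the device, then attempt to record a bad block, and only declare the whole device failed if even that record cannot be made. A compilable sketch of that policy with stub callbacks (do_io, record_badblock and fail_device are placeholders for sync_page_io, rdev_set_badblocks and md_error; the forced-failure stub just exercises the error path):

#include <stdbool.h>
#include <stdio.h>

static bool do_io(long sector, int sectors, bool write) { return false; }
static bool record_badblock(long sector, int sectors) { return true; }
static void fail_device(void) { puts("device failed"); }

static bool write_error_seen;

/* escalate gently: block first, whole device only as a last resort */
static bool checked_io(long sector, int sectors, bool write)
{
        if (do_io(sector, sectors, write))
                return true;            /* success */
        if (write)
                write_error_seen = true;
        if (!record_badblock(sector, sectors))
                fail_device();
        return false;
}

int main(void)
{
        checked_io(1024, 8, true);      /* takes the failure path here */
        printf("write_error_seen=%d\n", write_error_seen);
        return 0;
}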
*/ mddev_t *mddev = r1_bio->mddev; conf_t *conf = mddev->private; @@ -1464,7 +1217,9 @@ static int fix_sync_read_error(r1bio_t *r1_bio) * active, and resync is currently active */ rdev = conf->mirrors[d].rdev; - if (sync_page_io(rdev, sect, s<<9, + if (sync_page_io(rdev, + sect, + s<<9, bio->bi_io_vec[idx].bv_page, READ, false)) { success = 1; @@ -1478,36 +1233,16 @@ static int fix_sync_read_error(r1bio_t *r1_bio) if (!success) { char b[BDEVNAME_SIZE]; - int abort = 0; - /* Cannot read from anywhere, this block is lost. - * Record a bad block on each device. If that doesn't - * work just disable and interrupt the recovery. - * Don't fail devices as that won't really help. - */ + /* Cannot read from anywhere, array is toast */ + md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error" " for block %llu\n", mdname(mddev), bdevname(bio->bi_bdev, b), (unsigned long long)r1_bio->sector); - for (d = 0; d < conf->raid_disks; d++) { - rdev = conf->mirrors[d].rdev; - if (!rdev || test_bit(Faulty, &rdev->flags)) - continue; - if (!rdev_set_badblocks(rdev, sect, s, 0)) - abort = 1; - } - if (abort) { - mddev->recovery_disabled = 1; - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - md_done_sync(mddev, r1_bio->sectors, 0); - put_buf(r1_bio); - return 0; - } - /* Try next page */ - sectors -= s; - sect += s; - idx++; - continue; + md_done_sync(mddev, r1_bio->sectors, 0); + put_buf(r1_bio); + return 0; } start = d; @@ -1519,12 +1254,16 @@ static int fix_sync_read_error(r1bio_t *r1_bio) if (r1_bio->bios[d]->bi_end_io != end_sync_read) continue; rdev = conf->mirrors[d].rdev; - if (r1_sync_page_io(rdev, sect, s, - bio->bi_io_vec[idx].bv_page, - WRITE) == 0) { + if (sync_page_io(rdev, + sect, + s<<9, + bio->bi_io_vec[idx].bv_page, + WRITE, false) == 0) { r1_bio->bios[d]->bi_end_io = NULL; rdev_dec_pending(rdev, mddev); - } + md_error(mddev, rdev); + } else + atomic_add(s, &rdev->corrected_errors); } d = start; while (d != r1_bio->read_disk) { @@ -1534,10 +1273,12 @@ static int fix_sync_read_error(r1bio_t *r1_bio) if (r1_bio->bios[d]->bi_end_io != end_sync_read) continue; rdev = conf->mirrors[d].rdev; - if (r1_sync_page_io(rdev, sect, s, - bio->bi_io_vec[idx].bv_page, - READ) != 0) - atomic_add(s, &rdev->corrected_errors); + if (sync_page_io(rdev, + sect, + s<<9, + bio->bi_io_vec[idx].bv_page, + READ, false) == 0) + md_error(mddev, rdev); } sectors -= s; sect += s; @@ -1679,7 +1420,7 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) * * 1. Retries failed read operations on working mirrors. * 2. Updates the raid superblock when problems encounter. - * 3. Performs writes following reads for array synchronising. + * 3. Performs writes following reads for array syncronising. */ static void fix_read_error(conf_t *conf, int read_disk, @@ -1702,14 +1443,9 @@ static void fix_read_error(conf_t *conf, int read_disk, * which is the thread that might remove * a device. If raid1d ever becomes multi-threaded.... 
*/ - sector_t first_bad; - int bad_sectors; - rdev = conf->mirrors[d].rdev; if (rdev && test_bit(In_sync, &rdev->flags) && - is_badblock(rdev, sect, s, - &first_bad, &bad_sectors) == 0 && sync_page_io(rdev, sect, s<<9, conf->tmppage, READ, false)) success = 1; @@ -1721,10 +1457,8 @@ static void fix_read_error(conf_t *conf, int read_disk, } while (!success && d != read_disk); if (!success) { - /* Cannot read from anywhere - mark it bad */ - mdk_rdev_t *rdev = conf->mirrors[read_disk].rdev; - if (!rdev_set_badblocks(rdev, sect, s, 0)) - md_error(mddev, rdev); + /* Cannot read from anywhere -- bye bye array */ + md_error(mddev, conf->mirrors[read_disk].rdev); break; } /* write it back and re-read */ @@ -1735,9 +1469,13 @@ static void fix_read_error(conf_t *conf, int read_disk, d--; rdev = conf->mirrors[d].rdev; if (rdev && - test_bit(In_sync, &rdev->flags)) - r1_sync_page_io(rdev, sect, s, - conf->tmppage, WRITE); + test_bit(In_sync, &rdev->flags)) { + if (sync_page_io(rdev, sect, s<<9, + conf->tmppage, WRITE, false) + == 0) + /* Well, this device is dead */ + md_error(mddev, rdev); + } } d = start; while (d != read_disk) { @@ -1748,8 +1486,12 @@ static void fix_read_error(conf_t *conf, int read_disk, rdev = conf->mirrors[d].rdev; if (rdev && test_bit(In_sync, &rdev->flags)) { - if (r1_sync_page_io(rdev, sect, s, - conf->tmppage, READ)) { + if (sync_page_io(rdev, sect, s<<9, + conf->tmppage, READ, false) + == 0) + /* Well, this device is dead */ + md_error(mddev, rdev); + else { atomic_add(s, &rdev->corrected_errors); printk(KERN_INFO "md/raid1:%s: read error corrected " @@ -1766,255 +1508,21 @@ static void fix_read_error(conf_t *conf, int read_disk, } } -static void bi_complete(struct bio *bio, int error) -{ - complete((struct completion *)bio->bi_private); -} - -static int submit_bio_wait(int rw, struct bio *bio) -{ - struct completion event; - rw |= REQ_SYNC; - - init_completion(&event); - bio->bi_private = &event; - bio->bi_end_io = bi_complete; - submit_bio(rw, bio); - wait_for_completion(&event); - - return test_bit(BIO_UPTODATE, &bio->bi_flags); -} - -static int narrow_write_error(r1bio_t *r1_bio, int i) -{ - mddev_t *mddev = r1_bio->mddev; - conf_t *conf = mddev->private; - mdk_rdev_t *rdev = conf->mirrors[i].rdev; - int vcnt, idx; - struct bio_vec *vec; - - /* bio has the data to be written to device 'i' where - * we just recently had a write error. - * We repeatedly clone the bio and trim down to one block, - * then try the write. Where the write fails we record - * a bad block. - * It is conceivable that the bio doesn't exactly align with - * blocks. We must handle this somehow. - * - * We currently own a reference on the rdev. 
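Both versions of fix_read_error() above share one three-pass shape per chunk: probe the other in-sync mirrors until a read succeeds, write the good data back over the failing region, then re-read to confirm the sectors really were remapped (bumping corrected_errors on success). A loose user-space analogue over byte arrays, structure only, since the real code works sector-wise through sync_page_io:

#include <stdbool.h>
#include <stddef.h>
#include <string.h>

#define NDISKS 4

/* toy mirror set: each "disk" is a buffer, healthy[] marks in-sync ones */
static unsigned char disk[NDISKS][4096];
static bool healthy[NDISKS] = { true, true, true, true };

static bool repair_chunk(int bad_disk, size_t off, size_t len)
{
        unsigned char good[4096];
        int d, src = -1;

        /* pass 1: find another healthy mirror that can supply the data */
        for (d = 0; d < NDISKS && src < 0; d++)
                if (d != bad_disk && healthy[d])
                        src = d;
        if (src < 0)
                return false;   /* nowhere to read from: give up */

        memcpy(good, &disk[src][off], len);
        /* pass 2: overwrite the suspect region with known-good data */
        memcpy(&disk[bad_disk][off], good, len);
        /* pass 3: re-read and verify the rewrite stuck */
        return memcmp(&disk[bad_disk][off], good, len) == 0;
}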
- */ - - int block_sectors; - sector_t sector; - int sectors; - int sect_to_write = r1_bio->sectors; - int ok = 1; - - if (rdev->badblocks.shift < 0) - return 0; - - block_sectors = 1 << rdev->badblocks.shift; - sector = r1_bio->sector; - sectors = ((sector + block_sectors) - & ~(sector_t)(block_sectors - 1)) - - sector; - - if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { - vcnt = r1_bio->behind_page_count; - vec = r1_bio->behind_bvecs; - idx = 0; - while (vec[idx].bv_page == NULL) - idx++; - } else { - vcnt = r1_bio->master_bio->bi_vcnt; - vec = r1_bio->master_bio->bi_io_vec; - idx = r1_bio->master_bio->bi_idx; - } - while (sect_to_write) { - struct bio *wbio; - if (sectors > sect_to_write) - sectors = sect_to_write; - /* Write at 'sector' for 'sectors'*/ - - wbio = bio_alloc_mddev(GFP_NOIO, vcnt, mddev); - memcpy(wbio->bi_io_vec, vec, vcnt * sizeof(struct bio_vec)); - wbio->bi_sector = r1_bio->sector; - wbio->bi_rw = WRITE; - wbio->bi_vcnt = vcnt; - wbio->bi_size = r1_bio->sectors << 9; - wbio->bi_idx = idx; - - md_trim_bio(wbio, sector - r1_bio->sector, sectors); - wbio->bi_sector += rdev->data_offset; - wbio->bi_bdev = rdev->bdev; - if (submit_bio_wait(WRITE, wbio) == 0) - /* failure! */ - ok = rdev_set_badblocks(rdev, sector, - sectors, 0) - && ok; - - bio_put(wbio); - sect_to_write -= sectors; - sector += sectors; - sectors = block_sectors; - } - return ok; -} - -static void handle_sync_write_finished(conf_t *conf, r1bio_t *r1_bio) -{ - int m; - int s = r1_bio->sectors; - for (m = 0; m < conf->raid_disks ; m++) { - mdk_rdev_t *rdev = conf->mirrors[m].rdev; - struct bio *bio = r1_bio->bios[m]; - if (bio->bi_end_io == NULL) - continue; - if (test_bit(BIO_UPTODATE, &bio->bi_flags) && - test_bit(R1BIO_MadeGood, &r1_bio->state)) { - rdev_clear_badblocks(rdev, r1_bio->sector, s); - } - if (!test_bit(BIO_UPTODATE, &bio->bi_flags) && - test_bit(R1BIO_WriteError, &r1_bio->state)) { - if (!rdev_set_badblocks(rdev, r1_bio->sector, s, 0)) - md_error(conf->mddev, rdev); - } - } - put_buf(r1_bio); - md_done_sync(conf->mddev, s, 1); -} - -static void handle_write_finished(conf_t *conf, r1bio_t *r1_bio) -{ - int m; - for (m = 0; m < conf->raid_disks ; m++) - if (r1_bio->bios[m] == IO_MADE_GOOD) { - mdk_rdev_t *rdev = conf->mirrors[m].rdev; - rdev_clear_badblocks(rdev, - r1_bio->sector, - r1_bio->sectors); - rdev_dec_pending(rdev, conf->mddev); - } else if (r1_bio->bios[m] != NULL) { - /* This drive got a write error. We need to - * narrow down and record precise write - * errors. - */ - if (!narrow_write_error(r1_bio, m)) { - md_error(conf->mddev, - conf->mirrors[m].rdev); - /* an I/O failed, we can't clear the bitmap */ - set_bit(R1BIO_Degraded, &r1_bio->state); - } - rdev_dec_pending(conf->mirrors[m].rdev, - conf->mddev); - } - if (test_bit(R1BIO_WriteError, &r1_bio->state)) - close_write(r1_bio); - raid_end_bio_io(r1_bio); -} - -static void handle_read_error(conf_t *conf, r1bio_t *r1_bio) -{ - int disk; - int max_sectors; - mddev_t *mddev = conf->mddev; - struct bio *bio; - char b[BDEVNAME_SIZE]; - mdk_rdev_t *rdev; - - clear_bit(R1BIO_ReadError, &r1_bio->state); - /* we got a read error. Maybe the drive is bad. Maybe just - * the block and we can fix it. - * We freeze all other IO, and try reading the block from - * other devices. When we find one, we re-write - * and check it that fixes the read error. 
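One detail of the deleted narrow_write_error() deserves a worked example: with block_sectors a power of two (1 << badblocks.shift), the expression ((sector + block_sectors) & ~(block_sectors - 1)) - sector yields the distance to the next block boundary, so the first sub-write ends aligned and every later one covers a whole block. For instance:

#include <assert.h>

typedef unsigned long long sector_t;

/* sectors from 'sector' up to the next multiple of block_sectors */
static sector_t to_next_boundary(sector_t sector, sector_t block_sectors)
{
        return ((sector + block_sectors) & ~(block_sectors - 1)) - sector;
}

int main(void)
{
        /* 8-sector (4KiB) blocks: sector 21 is 3 sectors short of 24 */
        assert(to_next_boundary(21, 8) == 3);
        /* already aligned: the first chunk is a full block */
        assert(to_next_boundary(24, 8) == 8);
        return 0;
}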
- * This is all done synchronously while the array is - * frozen - */ - if (mddev->ro == 0) { - freeze_array(conf); - fix_read_error(conf, r1_bio->read_disk, - r1_bio->sector, r1_bio->sectors); - unfreeze_array(conf); - } else - md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); - - bio = r1_bio->bios[r1_bio->read_disk]; - bdevname(bio->bi_bdev, b); -read_more: - disk = read_balance(conf, r1_bio, &max_sectors); - if (disk == -1) { - printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O" - " read error for block %llu\n", - mdname(mddev), b, (unsigned long long)r1_bio->sector); - raid_end_bio_io(r1_bio); - } else { - const unsigned long do_sync - = r1_bio->master_bio->bi_rw & REQ_SYNC; - if (bio) { - r1_bio->bios[r1_bio->read_disk] = - mddev->ro ? IO_BLOCKED : NULL; - bio_put(bio); - } - r1_bio->read_disk = disk; - bio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev); - md_trim_bio(bio, r1_bio->sector - bio->bi_sector, max_sectors); - r1_bio->bios[r1_bio->read_disk] = bio; - rdev = conf->mirrors[disk].rdev; - printk_ratelimited(KERN_ERR - "md/raid1:%s: redirecting sector %llu" - " to other mirror: %s\n", - mdname(mddev), - (unsigned long long)r1_bio->sector, - bdevname(rdev->bdev, b)); - bio->bi_sector = r1_bio->sector + rdev->data_offset; - bio->bi_bdev = rdev->bdev; - bio->bi_end_io = raid1_end_read_request; - bio->bi_rw = READ | do_sync; - bio->bi_private = r1_bio; - if (max_sectors < r1_bio->sectors) { - /* Drat - have to split this up more */ - struct bio *mbio = r1_bio->master_bio; - int sectors_handled = (r1_bio->sector + max_sectors - - mbio->bi_sector); - r1_bio->sectors = max_sectors; - spin_lock_irq(&conf->device_lock); - if (mbio->bi_phys_segments == 0) - mbio->bi_phys_segments = 2; - else - mbio->bi_phys_segments++; - spin_unlock_irq(&conf->device_lock); - generic_make_request(bio); - bio = NULL; - - r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); - - r1_bio->master_bio = mbio; - r1_bio->sectors = (mbio->bi_size >> 9) - - sectors_handled; - r1_bio->state = 0; - set_bit(R1BIO_ReadError, &r1_bio->state); - r1_bio->mddev = mddev; - r1_bio->sector = mbio->bi_sector + sectors_handled; - - goto read_more; - } else - generic_make_request(bio); - } -} - static void raid1d(mddev_t *mddev) { r1bio_t *r1_bio; + struct bio *bio; unsigned long flags; conf_t *conf = mddev->private; struct list_head *head = &conf->retry_list; + mdk_rdev_t *rdev; struct blk_plug plug; md_check_recovery(mddev); blk_start_plug(&plug); for (;;) { + char b[BDEVNAME_SIZE]; if (atomic_read(&mddev->plug_cnt) == 0) flush_pending_writes(conf); @@ -2031,26 +1539,62 @@ static void raid1d(mddev_t *mddev) mddev = r1_bio->mddev; conf = mddev->private; - if (test_bit(R1BIO_IsSync, &r1_bio->state)) { - if (test_bit(R1BIO_MadeGood, &r1_bio->state) || - test_bit(R1BIO_WriteError, &r1_bio->state)) - handle_sync_write_finished(conf, r1_bio); - else - sync_request_write(mddev, r1_bio); - } else if (test_bit(R1BIO_MadeGood, &r1_bio->state) || - test_bit(R1BIO_WriteError, &r1_bio->state)) - handle_write_finished(conf, r1_bio); - else if (test_bit(R1BIO_ReadError, &r1_bio->state)) - handle_read_error(conf, r1_bio); - else - /* just a partial read to be scheduled from separate - * context + if (test_bit(R1BIO_IsSync, &r1_bio->state)) + sync_request_write(mddev, r1_bio); + else { + int disk; + + /* we got a read error. Maybe the drive is bad. Maybe just + * the block and we can fix it. + * We freeze all other IO, and try reading the block from + * other devices. 
When we find one, we re-write + * and check it that fixes the read error. + * This is all done synchronously while the array is + * frozen */ - generic_make_request(r1_bio->bios[r1_bio->read_disk]); - + if (mddev->ro == 0) { + freeze_array(conf); + fix_read_error(conf, r1_bio->read_disk, + r1_bio->sector, + r1_bio->sectors); + unfreeze_array(conf); + } else + md_error(mddev, + conf->mirrors[r1_bio->read_disk].rdev); + + bio = r1_bio->bios[r1_bio->read_disk]; + if ((disk=read_balance(conf, r1_bio)) == -1) { + printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O" + " read error for block %llu\n", + mdname(mddev), + bdevname(bio->bi_bdev,b), + (unsigned long long)r1_bio->sector); + raid_end_bio_io(r1_bio); + } else { + const unsigned long do_sync = r1_bio->master_bio->bi_rw & REQ_SYNC; + r1_bio->bios[r1_bio->read_disk] = + mddev->ro ? IO_BLOCKED : NULL; + r1_bio->read_disk = disk; + bio_put(bio); + bio = bio_clone_mddev(r1_bio->master_bio, + GFP_NOIO, mddev); + r1_bio->bios[r1_bio->read_disk] = bio; + rdev = conf->mirrors[disk].rdev; + if (printk_ratelimit()) + printk(KERN_ERR "md/raid1:%s: redirecting sector %llu to" + " other mirror: %s\n", + mdname(mddev), + (unsigned long long)r1_bio->sector, + bdevname(rdev->bdev,b)); + bio->bi_sector = r1_bio->sector + rdev->data_offset; + bio->bi_bdev = rdev->bdev; + bio->bi_end_io = raid1_end_read_request; + bio->bi_rw = READ | do_sync; + bio->bi_private = r1_bio; + generic_make_request(bio); + } + } cond_resched(); - if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) - md_check_recovery(mddev); [...] if (!conf->r1buf_pool) if (init_resync(conf)) @@ -2181,89 +1723,36 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i rdev = rcu_dereference(conf->mirrors[i].rdev); if (rdev == NULL || - test_bit(Faulty, &rdev->flags)) { + test_bit(Faulty, &rdev->flags)) { still_degraded = 1; + continue; } else if (!test_bit(In_sync, &rdev->flags)) { bio->bi_rw = WRITE; bio->bi_end_io = end_sync_write; write_targets ++; } else { /* may need to read from here */ - sector_t first_bad = MaxSector; - int bad_sectors; - - if (is_badblock(rdev, sector_nr, good_sectors, - &first_bad, &bad_sectors)) { - if (first_bad > sector_nr) - good_sectors = first_bad - sector_nr; - else { - bad_sectors -= (sector_nr - first_bad); - if (min_bad == 0 || - min_bad > bad_sectors) - min_bad = bad_sectors; - } - } - if (sector_nr < first_bad) { - if (test_bit(WriteMostly, &rdev->flags)) { - if (wonly < 0) - wonly = i; - } else { - if (disk < 0) - disk = i; - } - bio->bi_rw = READ; - bio->bi_end_io = end_sync_read; - read_targets++; + bio->bi_rw = READ; + bio->bi_end_io = end_sync_read; + if (test_bit(WriteMostly, &rdev->flags)) { + if (wonly < 0) + wonly = i; + } else { + if (disk < 0) + disk = i; } + read_targets++; } - if (bio->bi_end_io) { - atomic_inc(&rdev->nr_pending); - bio->bi_sector = sector_nr + rdev->data_offset; - bio->bi_bdev = rdev->bdev; - bio->bi_private = r1_bio; - } + atomic_inc(&rdev->nr_pending); + bio->bi_sector = sector_nr + rdev->data_offset; + bio->bi_bdev = rdev->bdev; + bio->bi_private = r1_bio; } rcu_read_unlock(); if (disk < 0) disk = wonly; r1_bio->read_disk = disk; - if (read_targets == 0 && min_bad > 0) { - /* These sectors are bad on all InSync devices, so we - * need to mark them bad on all write targets - */ - int ok = 1; - for (i = 0 ; i < conf->raid_disks ; i++) - if (r1_bio->bios[i]->bi_end_io == end_sync_write) { - mdk_rdev_t *rdev = - rcu_dereference(conf->mirrors[i].rdev); - ok = rdev_set_badblocks(rdev, sector_nr, - min_bad, 0 - ) && ok; - } - set_bit(MD_CHANGE_DEVS, &mddev->flags); - *skipped
= 1; - put_buf(r1_bio); - - if (!ok) { - /* Cannot record the badblocks, so need to - * abort the resync. - * If there are multiple read targets, could just - * fail the really bad ones ??? - */ - conf->recovery_disabled = mddev->recovery_disabled; - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - return 0; - } else - return min_bad; - - } - if (min_bad > 0 && min_bad < good_sectors) { - /* only resync enough to reach the next bad->good - * transition */ - good_sectors = min_bad; - } - if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && read_targets > 0) /* extra read targets are also write targets */ write_targets += read_targets-1; @@ -2280,8 +1769,6 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i if (max_sector > mddev->resync_max) max_sector = mddev->resync_max; /* Don't do IO beyond here */ - if (max_sector > sector_nr + good_sectors) - max_sector = sector_nr + good_sectors; nr_sectors = 0; sync_blocks = 0; do { @@ -2667,13 +2154,18 @@ static int raid1_reshape(mddev_t *mddev) for (d = d2 = 0; d < conf->raid_disks; d++) { mdk_rdev_t *rdev = conf->mirrors[d].rdev; if (rdev && rdev->raid_disk != d2) { - sysfs_unlink_rdev(mddev, rdev); + char nm[20]; + sprintf(nm, "rd%d", rdev->raid_disk); + sysfs_remove_link(&mddev->kobj, nm); rdev->raid_disk = d2; - sysfs_unlink_rdev(mddev, rdev); - if (sysfs_link_rdev(mddev, rdev)) + sprintf(nm, "rd%d", rdev->raid_disk); + sysfs_remove_link(&mddev->kobj, nm); + if (sysfs_create_link(&mddev->kobj, + &rdev->kobj, nm)) printk(KERN_WARNING - "md/raid1:%s: cannot register rd%d\n", - mdname(mddev), rdev->raid_disk); + "md/raid1:%s: cannot register " + "%s\n", + mdname(mddev), nm); } if (rdev) newmirrors[d2++].rdev = rdev; diff --git a/trunk/drivers/md/raid1.h b/trunk/drivers/md/raid1.h index e0d676b48974..e743a64fac4f 100644 --- a/trunk/drivers/md/raid1.h +++ b/trunk/drivers/md/raid1.h @@ -48,12 +48,6 @@ struct r1_private_data_s { * (fresh device added). * Cleared when a sync completes. */ - int recovery_disabled; /* when the same as - * mddev->recovery_disabled - * we don't allow recovery - * to be attempted as we - * expect a read error - */ wait_queue_head_t wait_barrier; @@ -101,7 +95,7 @@ struct r1bio_s { struct list_head retry_list; /* Next two are only valid when R1BIO_BehindIO is set */ - struct bio_vec *behind_bvecs; + struct page **behind_pages; int behind_page_count; /* * if the IO is in WRITE direction, then multiple bios are used. @@ -116,24 +110,13 @@ struct r1bio_s { * correct the read error. To keep track of bad blocks on a per-bio * level, we store IO_BLOCKED in the appropriate 'bios' pointer */ -#define IO_BLOCKED ((struct bio *)1) -/* When we successfully write to a known bad-block, we need to remove the - * bad-block marking which must be done from process context. So we record - * the success by setting bios[n] to IO_MADE_GOOD - */ -#define IO_MADE_GOOD ((struct bio *)2) - -#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) +#define IO_BLOCKED ((struct bio*)1) /* bits for r1bio.state */ #define R1BIO_Uptodate 0 #define R1BIO_IsSync 1 #define R1BIO_Degraded 2 #define R1BIO_BehindIO 3 -/* Set ReadError on bios that experience a readerror so that - * raid1d knows what to do with them. - */ -#define R1BIO_ReadError 4 /* For write-behind requests, we call bi_end_io when * the last non-write-behind device completes, providing * any write was successful. Otherwise we call when @@ -142,11 +125,6 @@ struct r1bio_s { * Record that bi_end_io was called with this flag... 
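IO_BLOCKED, together with the IO_MADE_GOOD/BIO_SPECIAL pair being removed, leans on a standard kernel idiom: tiny integer values cast to a pointer type can never alias a real allocation, so a bios[] slot can carry out-of-band state with no extra storage. Illustrated below (struct bio here is a dummy; the defines mirror the header text above):

#include <assert.h>
#include <stddef.h>

struct bio { int dummy; };

#define IO_BLOCKED	((struct bio *)1)  /* slot intentionally skipped */
#define IO_MADE_GOOD	((struct bio *)2)  /* write fixed a known bad block */
#define BIO_SPECIAL(bio) ((unsigned long)(bio) <= 2)

int main(void)
{
        struct bio real;
        struct bio *slot = IO_MADE_GOOD;

        assert(BIO_SPECIAL(NULL));      /* NULL (0) counts as special too */
        assert(BIO_SPECIAL(IO_BLOCKED));
        assert(BIO_SPECIAL(slot));
        assert(!BIO_SPECIAL(&real));    /* real objects never sit at 0..2 */
        return 0;
}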
*/ #define R1BIO_Returned 6 -/* If a write for this request means we can clear some - * known-bad-block records, we set this flag - */ -#define R1BIO_MadeGood 7 -#define R1BIO_WriteError 8 extern int md_raid1_congested(mddev_t *mddev, int bits); diff --git a/trunk/drivers/md/raid10.c b/trunk/drivers/md/raid10.c index 8b29cd4f01c8..6e846688962f 100644 --- a/trunk/drivers/md/raid10.c +++ b/trunk/drivers/md/raid10.c @@ -22,7 +22,6 @@ #include #include #include -#include #include "md.h" #include "raid10.h" #include "raid0.h" @@ -124,14 +123,7 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) for (j = 0 ; j < nalloc; j++) { bio = r10_bio->devs[j].bio; for (i = 0; i < RESYNC_PAGES; i++) { - if (j == 1 && !test_bit(MD_RECOVERY_SYNC, - &conf->mddev->recovery)) { - /* we can share bv_page's during recovery */ - struct bio *rbio = r10_bio->devs[0].bio; - page = rbio->bi_io_vec[i].bv_page; - get_page(page); - } else - page = alloc_page(gfp_flags); + page = alloc_page(gfp_flags); if (unlikely(!page)) goto out_free_pages; @@ -181,7 +173,7 @@ static void put_all_bios(conf_t *conf, r10bio_t *r10_bio) for (i = 0; i < conf->copies; i++) { struct bio **bio = & r10_bio->devs[i].bio; - if (!BIO_SPECIAL(*bio)) + if (*bio && *bio != IO_BLOCKED) bio_put(*bio); *bio = NULL; } @@ -191,6 +183,12 @@ static void free_r10bio(r10bio_t *r10_bio) { conf_t *conf = r10_bio->mddev->private; + /* + * Wake up any possible resync thread that waits for the device + * to go idle. + */ + allow_barrier(conf); + put_all_bios(conf, r10_bio); mempool_free(r10_bio, conf->r10bio_pool); } @@ -229,27 +227,9 @@ static void reschedule_retry(r10bio_t *r10_bio) static void raid_end_bio_io(r10bio_t *r10_bio) { struct bio *bio = r10_bio->master_bio; - int done; - conf_t *conf = r10_bio->mddev->private; - if (bio->bi_phys_segments) { - unsigned long flags; - spin_lock_irqsave(&conf->device_lock, flags); - bio->bi_phys_segments--; - done = (bio->bi_phys_segments == 0); - spin_unlock_irqrestore(&conf->device_lock, flags); - } else - done = 1; - if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) - clear_bit(BIO_UPTODATE, &bio->bi_flags); - if (done) { - bio_endio(bio, 0); - /* - * Wake up any possible resync thread that waits for the device - * to go idle. - */ - allow_barrier(conf); - } + bio_endio(bio, + test_bit(R10BIO_Uptodate, &r10_bio->state) ? 
0 : -EIO); free_r10bio(r10_bio); } @@ -264,26 +244,6 @@ static inline void update_head_pos(int slot, r10bio_t *r10_bio) r10_bio->devs[slot].addr + (r10_bio->sectors); } -/* - * Find the disk number which triggered given bio - */ -static int find_bio_disk(conf_t *conf, r10bio_t *r10_bio, - struct bio *bio, int *slotp) -{ - int slot; - - for (slot = 0; slot < conf->copies; slot++) - if (r10_bio->devs[slot].bio == bio) - break; - - BUG_ON(slot == conf->copies); - update_head_pos(slot, r10_bio); - - if (slotp) - *slotp = slot; - return r10_bio->devs[slot].devnum; -} - static void raid10_end_read_request(struct bio *bio, int error) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); @@ -317,45 +277,34 @@ static void raid10_end_read_request(struct bio *bio, int error) * oops, read error - keep the refcount on the rdev */ char b[BDEVNAME_SIZE]; - printk_ratelimited(KERN_ERR - "md/raid10:%s: %s: rescheduling sector %llu\n", - mdname(conf->mddev), - bdevname(conf->mirrors[dev].rdev->bdev, b), - (unsigned long long)r10_bio->sector); - set_bit(R10BIO_ReadError, &r10_bio->state); + if (printk_ratelimit()) + printk(KERN_ERR "md/raid10:%s: %s: rescheduling sector %llu\n", + mdname(conf->mddev), + bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector); reschedule_retry(r10_bio); } } -static void close_write(r10bio_t *r10_bio) -{ - /* clear the bitmap if all writes complete successfully */ - bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector, - r10_bio->sectors, - !test_bit(R10BIO_Degraded, &r10_bio->state), - 0); - md_write_end(r10_bio->mddev); -} - static void raid10_end_write_request(struct bio *bio, int error) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); r10bio_t *r10_bio = bio->bi_private; - int dev; - int dec_rdev = 1; + int slot, dev; conf_t *conf = r10_bio->mddev->private; - int slot; - dev = find_bio_disk(conf, r10_bio, bio, &slot); + for (slot = 0; slot < conf->copies; slot++) + if (r10_bio->devs[slot].bio == bio) + break; + dev = r10_bio->devs[slot].devnum; /* * this branch is our 'one mirror IO has finished' event handler: */ if (!uptodate) { - set_bit(WriteErrorSeen, &conf->mirrors[dev].rdev->flags); - set_bit(R10BIO_WriteError, &r10_bio->state); - dec_rdev = 0; - } else { + md_error(r10_bio->mddev, conf->mirrors[dev].rdev); + /* an I/O failed, we can't clear the bitmap */ + set_bit(R10BIO_Degraded, &r10_bio->state); + } else /* * Set R10BIO_Uptodate in our master bio, so that * we will return a good error code for to the higher @@ -365,22 +314,9 @@ static void raid10_end_write_request(struct bio *bio, int error) * user-side. So if something waits for IO, then it will * wait for the 'master' bio. */ - sector_t first_bad; - int bad_sectors; - set_bit(R10BIO_Uptodate, &r10_bio->state); - /* Maybe we can clear some bad blocks. */ - if (is_badblock(conf->mirrors[dev].rdev, - r10_bio->devs[slot].addr, - r10_bio->sectors, - &first_bad, &bad_sectors)) { - bio_put(bio); - r10_bio->devs[slot].bio = IO_MADE_GOOD; - dec_rdev = 0; - set_bit(R10BIO_MadeGood, &r10_bio->state); - } - } + update_head_pos(slot, r10_bio); /* * @@ -388,18 +324,16 @@ static void raid10_end_write_request(struct bio *bio, int error) * already. 
*/ if (atomic_dec_and_test(&r10_bio->remaining)) { - if (test_bit(R10BIO_WriteError, &r10_bio->state)) - reschedule_retry(r10_bio); - else { - close_write(r10_bio); - if (test_bit(R10BIO_MadeGood, &r10_bio->state)) - reschedule_retry(r10_bio); - else - raid_end_bio_io(r10_bio); - } + /* clear the bitmap if all writes complete successfully */ + bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector, + r10_bio->sectors, + !test_bit(R10BIO_Degraded, &r10_bio->state), + 0); + md_write_end(r10_bio->mddev); + raid_end_bio_io(r10_bio); } - if (dec_rdev) - rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); + + rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); } @@ -550,12 +484,11 @@ static int raid10_mergeable_bvec(struct request_queue *q, * FIXME: possibly should rethink readbalancing and do it differently * depending on near_copies / far_copies geometry. */ -static int read_balance(conf_t *conf, r10bio_t *r10_bio, int *max_sectors) +static int read_balance(conf_t *conf, r10bio_t *r10_bio) { const sector_t this_sector = r10_bio->sector; int disk, slot; - int sectors = r10_bio->sectors; - int best_good_sectors; + const int sectors = r10_bio->sectors; sector_t new_distance, best_dist; mdk_rdev_t *rdev; int do_balance; @@ -564,10 +497,8 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio, int *max_sectors) raid10_find_phys(conf, r10_bio); rcu_read_lock(); retry: - sectors = r10_bio->sectors; best_slot = -1; best_dist = MaxSector; - best_good_sectors = 0; do_balance = 1; /* * Check if we can balance. We can balance on the whole @@ -580,10 +511,6 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio, int *max_sectors) do_balance = 0; for (slot = 0; slot < conf->copies ; slot++) { - sector_t first_bad; - int bad_sectors; - sector_t dev_sector; - if (r10_bio->devs[slot].bio == IO_BLOCKED) continue; disk = r10_bio->devs[slot].devnum; @@ -593,37 +520,6 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio, int *max_sectors) if (!test_bit(In_sync, &rdev->flags)) continue; - dev_sector = r10_bio->devs[slot].addr; - if (is_badblock(rdev, dev_sector, sectors, - &first_bad, &bad_sectors)) { - if (best_dist < MaxSector) - /* Already have a better slot */ - continue; - if (first_bad <= dev_sector) { - /* Cannot read here. If this is the - * 'primary' device, then we must not read - * beyond 'bad_sectors' from another device. 
- */ - bad_sectors -= (dev_sector - first_bad); - if (!do_balance && sectors > bad_sectors) - sectors = bad_sectors; - if (best_good_sectors > sectors) - best_good_sectors = sectors; - } else { - sector_t good_sectors = - first_bad - dev_sector; - if (good_sectors > best_good_sectors) { - best_good_sectors = good_sectors; - best_slot = slot; - } - if (!do_balance) - /* Must read from here */ - break; - } - continue; - } else - best_good_sectors = sectors; - if (!do_balance) break; @@ -665,7 +561,6 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio, int *max_sectors) } else disk = -1; rcu_read_unlock(); - *max_sectors = best_good_sectors; return disk; } @@ -839,8 +734,6 @@ static int make_request(mddev_t *mddev, struct bio * bio) unsigned long flags; mdk_rdev_t *blocked_rdev; int plugged; - int sectors_handled; - int max_sectors; if (unlikely(bio->bi_rw & REQ_FLUSH)) { md_flush_request(mddev, bio); @@ -915,26 +808,12 @@ static int make_request(mddev_t *mddev, struct bio * bio) r10_bio->sector = bio->bi_sector; r10_bio->state = 0; - /* We might need to issue multiple reads to different - * devices if there are bad blocks around, so we keep - * track of the number of reads in bio->bi_phys_segments. - * If this is 0, there is only one r10_bio and no locking - * will be needed when the request completes. If it is - * non-zero, then it is the number of not-completed requests. - */ - bio->bi_phys_segments = 0; - clear_bit(BIO_SEG_VALID, &bio->bi_flags); - if (rw == READ) { /* * read balancing logic: */ - int disk; - int slot; - -read_again: - disk = read_balance(conf, r10_bio, &max_sectors); - slot = r10_bio->read_slot; + int disk = read_balance(conf, r10_bio); + int slot = r10_bio->read_slot; if (disk < 0) { raid_end_bio_io(r10_bio); return 0; @@ -942,8 +821,6 @@ static int make_request(mddev_t *mddev, struct bio * bio) mirror = conf->mirrors + disk; read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); - md_trim_bio(read_bio, r10_bio->sector - bio->bi_sector, - max_sectors); r10_bio->devs[slot].bio = read_bio; @@ -954,37 +831,7 @@ static int make_request(mddev_t *mddev, struct bio * bio) read_bio->bi_rw = READ | do_sync; read_bio->bi_private = r10_bio; - if (max_sectors < r10_bio->sectors) { - /* Could not read all from this device, so we will - * need another r10_bio. - */ - sectors_handled = (r10_bio->sectors + max_sectors - - bio->bi_sector); - r10_bio->sectors = max_sectors; - spin_lock_irq(&conf->device_lock); - if (bio->bi_phys_segments == 0) - bio->bi_phys_segments = 2; - else - bio->bi_phys_segments++; - spin_unlock(&conf->device_lock); - /* Cannot call generic_make_request directly - * as that will be queued in __generic_make_request - * and subsequent mempool_alloc might block - * waiting for it. so hand bio over to raid10d. - */ - reschedule_retry(r10_bio); - - r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); - - r10_bio->master_bio = bio; - r10_bio->sectors = ((bio->bi_size >> 9) - - sectors_handled); - r10_bio->state = 0; - r10_bio->mddev = mddev; - r10_bio->sector = bio->bi_sector + sectors_handled; - goto read_again; - } else - generic_make_request(read_bio); + generic_make_request(read_bio); return 0; } @@ -994,22 +841,13 @@ static int make_request(mddev_t *mddev, struct bio * bio) /* first select target devices under rcu_lock and * inc refcount on their rdev. Record them by setting * bios[x] to bio - * If there are known/acknowledged bad blocks on any device - * on which we have seen a write error, we want to avoid - * writing to those blocks. 
This potentially requires several - * writes to write around the bad blocks. Each set of writes - * gets its own r10_bio with a set of bios attached. The number - * of r10_bios is recored in bio->bi_phys_segments just as with - * the read case. */ plugged = mddev_check_plugged(mddev); raid10_find_phys(conf, r10_bio); -retry_write: + retry_write: blocked_rdev = NULL; rcu_read_lock(); - max_sectors = r10_bio->sectors; - for (i = 0; i < conf->copies; i++) { int d = r10_bio->devs[i].devnum; mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[d].rdev); @@ -1018,55 +856,13 @@ static int make_request(mddev_t *mddev, struct bio * bio) blocked_rdev = rdev; break; } - r10_bio->devs[i].bio = NULL; - if (!rdev || test_bit(Faulty, &rdev->flags)) { + if (rdev && !test_bit(Faulty, &rdev->flags)) { + atomic_inc(&rdev->nr_pending); + r10_bio->devs[i].bio = bio; + } else { + r10_bio->devs[i].bio = NULL; set_bit(R10BIO_Degraded, &r10_bio->state); - continue; } - if (test_bit(WriteErrorSeen, &rdev->flags)) { - sector_t first_bad; - sector_t dev_sector = r10_bio->devs[i].addr; - int bad_sectors; - int is_bad; - - is_bad = is_badblock(rdev, dev_sector, - max_sectors, - &first_bad, &bad_sectors); - if (is_bad < 0) { - /* Mustn't write here until the bad block - * is acknowledged - */ - atomic_inc(&rdev->nr_pending); - set_bit(BlockedBadBlocks, &rdev->flags); - blocked_rdev = rdev; - break; - } - if (is_bad && first_bad <= dev_sector) { - /* Cannot write here at all */ - bad_sectors -= (dev_sector - first_bad); - if (bad_sectors < max_sectors) - /* Mustn't write more than bad_sectors - * to other devices yet - */ - max_sectors = bad_sectors; - /* We don't set R10BIO_Degraded as that - * only applies if the disk is missing, - * so it might be re-added, and we want to - * know to recover this chunk. - * In this case the device is here, and the - * fact that this chunk is not in-sync is - * recorded in the bad block log. - */ - continue; - } - if (is_bad) { - int good_sectors = first_bad - dev_sector; - if (good_sectors < max_sectors) - max_sectors = good_sectors; - } - } - r10_bio->devs[i].bio = bio; - atomic_inc(&rdev->nr_pending); } rcu_read_unlock(); @@ -1086,22 +882,8 @@ static int make_request(mddev_t *mddev, struct bio * bio) goto retry_write; } - if (max_sectors < r10_bio->sectors) { - /* We are splitting this into multiple parts, so - * we need to prepare for allocating another r10_bio. 
- */ - r10_bio->sectors = max_sectors; - spin_lock_irq(&conf->device_lock); - if (bio->bi_phys_segments == 0) - bio->bi_phys_segments = 2; - else - bio->bi_phys_segments++; - spin_unlock_irq(&conf->device_lock); - } - sectors_handled = r10_bio->sector + max_sectors - bio->bi_sector; - atomic_set(&r10_bio->remaining, 1); - bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0); + bitmap_startwrite(mddev->bitmap, bio->bi_sector, r10_bio->sectors, 0); for (i = 0; i < conf->copies; i++) { struct bio *mbio; @@ -1110,12 +892,10 @@ static int make_request(mddev_t *mddev, struct bio * bio) continue; mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); - md_trim_bio(mbio, r10_bio->sector - bio->bi_sector, - max_sectors); r10_bio->devs[i].bio = mbio; - mbio->bi_sector = (r10_bio->devs[i].addr+ - conf->mirrors[d].rdev->data_offset); + mbio->bi_sector = r10_bio->devs[i].addr+ + conf->mirrors[d].rdev->data_offset; mbio->bi_bdev = conf->mirrors[d].rdev->bdev; mbio->bi_end_io = raid10_end_write_request; mbio->bi_rw = WRITE | do_sync | do_fua; @@ -1140,21 +920,6 @@ static int make_request(mddev_t *mddev, struct bio * bio) /* In case raid10d snuck in to freeze_array */ wake_up(&conf->wait_barrier); - if (sectors_handled < (bio->bi_size >> 9)) { - /* We need another r10_bio. It has already been counted - * in bio->bi_phys_segments. - */ - r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); - - r10_bio->master_bio = bio; - r10_bio->sectors = (bio->bi_size >> 9) - sectors_handled; - - r10_bio->mddev = mddev; - r10_bio->sector = bio->bi_sector + sectors_handled; - r10_bio->state = 0; - goto retry_write; - } - if (do_sync || !mddev->bitmap || !plugged) md_wakeup_thread(mddev->thread); return 0; @@ -1184,30 +949,6 @@ static void status(struct seq_file *seq, mddev_t *mddev) seq_printf(seq, "]"); } -/* check if there are enough drives for - * every block to appear on atleast one. - * Don't consider the device numbered 'ignore' - * as we might be about to remove it. - */ -static int enough(conf_t *conf, int ignore) -{ - int first = 0; - - do { - int n = conf->copies; - int cnt = 0; - while (n--) { - if (conf->mirrors[first].rdev && - first != ignore) - cnt++; - first = (first+1) % conf->raid_disks; - } - if (cnt == 0) - return 0; - } while (first != 0); - return 1; -} - static void error(mddev_t *mddev, mdk_rdev_t *rdev) { char b[BDEVNAME_SIZE]; @@ -1220,9 +961,13 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) * else mark the drive as failed */ if (test_bit(In_sync, &rdev->flags) - && !enough(conf, rdev->raid_disk)) + && conf->raid_disks-mddev->degraded == 1) /* * Don't fail the drive, just return an IO error. + * The test should really be more sophisticated than + * "working_disks == 1", but it isn't critical, and + * can wait until we do more sophisticated "is the drive + * really dead" tests... 
*/ return; if (test_and_clear_bit(In_sync, &rdev->flags)) { @@ -1235,7 +980,6 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) */ set_bit(MD_RECOVERY_INTR, &mddev->recovery); } - set_bit(Blocked, &rdev->flags); set_bit(Faulty, &rdev->flags); set_bit(MD_CHANGE_DEVS, &mddev->flags); printk(KERN_ALERT @@ -1278,6 +1022,27 @@ static void close_sync(conf_t *conf) conf->r10buf_pool = NULL; } +/* check if there are enough drives for + * every block to appear on atleast one + */ +static int enough(conf_t *conf) +{ + int first = 0; + + do { + int n = conf->copies; + int cnt = 0; + while (n--) { + if (conf->mirrors[first].rdev) + cnt++; + first = (first+1) % conf->raid_disks; + } + if (cnt == 0) + return 0; + } while (first != 0); + return 1; +} + static int raid10_spare_active(mddev_t *mddev) { int i; @@ -1313,6 +1078,7 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) conf_t *conf = mddev->private; int err = -EEXIST; int mirror; + mirror_info_t *p; int first = 0; int last = conf->raid_disks - 1; @@ -1321,47 +1087,44 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) * very different from resync */ return -EBUSY; - if (!enough(conf, -1)) + if (!enough(conf)) return -EINVAL; if (rdev->raid_disk >= 0) first = last = rdev->raid_disk; - if (rdev->saved_raid_disk >= first && + if (rdev->saved_raid_disk >= 0 && + rdev->saved_raid_disk >= first && conf->mirrors[rdev->saved_raid_disk].rdev == NULL) mirror = rdev->saved_raid_disk; else mirror = first; - for ( ; mirror <= last ; mirror++) { - mirror_info_t *p = &conf->mirrors[mirror]; - if (p->recovery_disabled == mddev->recovery_disabled) - continue; - if (!p->rdev) - continue; + for ( ; mirror <= last ; mirror++) + if ( !(p=conf->mirrors+mirror)->rdev) { + + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); + /* as we don't honour merge_bvec_fn, we must + * never risk violating it, so limit + * ->max_segments to one lying with a single + * page, as a one page request is never in + * violation. + */ + if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { + blk_queue_max_segments(mddev->queue, 1); + blk_queue_segment_boundary(mddev->queue, + PAGE_CACHE_SIZE - 1); + } - disk_stack_limits(mddev->gendisk, rdev->bdev, - rdev->data_offset << 9); - /* as we don't honour merge_bvec_fn, we must - * never risk violating it, so limit - * ->max_segments to one lying with a single - * page, as a one page request is never in - * violation. - */ - if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { - blk_queue_max_segments(mddev->queue, 1); - blk_queue_segment_boundary(mddev->queue, - PAGE_CACHE_SIZE - 1); + p->head_position = 0; + rdev->raid_disk = mirror; + err = 0; + if (rdev->saved_raid_disk != mirror) + conf->fullsync = 1; + rcu_assign_pointer(p->rdev, rdev); + break; } - p->head_position = 0; - rdev->raid_disk = mirror; - err = 0; - if (rdev->saved_raid_disk != mirror) - conf->fullsync = 1; - rcu_assign_pointer(p->rdev, rdev); - break; - } - md_integrity_add_rdev(rdev, mddev); print_conf(conf); return err; @@ -1386,8 +1149,7 @@ static int raid10_remove_disk(mddev_t *mddev, int number) * is not possible. 
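The restored, argument-free enough() above asks whether every group of 'copies' consecutive slots around the ring of raid_disks still holds a live device; if any window is empty, some blocks have no surviving copy. The same loop extracted into a self-contained check, with a bool array standing in for the rdev pointers:

#include <assert.h>
#include <stdbool.h>

/* does every run of 'copies' consecutive slots contain a live disk? */
static int enough(const bool *alive, int raid_disks, int copies)
{
        int first = 0;

        do {
                int n = copies;
                int cnt = 0;

                while (n--) {
                        if (alive[first])
                                cnt++;
                        first = (first + 1) % raid_disks;
                }
                if (cnt == 0)
                        return 0;
        } while (first != 0);
        return 1;
}

int main(void)
{
        bool ok[4]   = { true, false, true, false };    /* near_copies=2 */
        bool dead[4] = { true, true, false, false };

        assert(enough(ok, 4, 2));       /* every pair has a survivor */
        assert(!enough(dead, 4, 2));    /* slots 2+3 form an empty pair */
        return 0;
}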
*/ if (!test_bit(Faulty, &rdev->flags) && - mddev->recovery_disabled != p->recovery_disabled && - enough(conf, -1)) { + enough(conf)) { err = -EBUSY; goto abort; } @@ -1412,18 +1174,24 @@ static void end_sync_read(struct bio *bio, int error) { r10bio_t *r10_bio = bio->bi_private; conf_t *conf = r10_bio->mddev->private; - int d; + int i,d; - d = find_bio_disk(conf, r10_bio, bio, NULL); + for (i=0; i<conf->copies; i++) + if (r10_bio->devs[i].bio == bio) + break; + BUG_ON(i == conf->copies); + update_head_pos(i, r10_bio); + d = r10_bio->devs[i].devnum; if (test_bit(BIO_UPTODATE, &bio->bi_flags)) set_bit(R10BIO_Uptodate, &r10_bio->state); - else - /* The write handler will notice the lack of - * R10BIO_Uptodate and record any errors etc - */ + else { atomic_add(r10_bio->sectors, &conf->mirrors[d].rdev->corrected_errors); + if (!test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery)) + md_error(r10_bio->mddev, + conf->mirrors[d].rdev); + } /* for reconstruct, we always reschedule after a read. * for resync, only after all reads @@ -1438,60 +1206,40 @@ static void end_sync_read(struct bio *bio, int error) } } -static void end_sync_request(r10bio_t *r10_bio) +static void end_sync_write(struct bio *bio, int error) { + int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + r10bio_t *r10_bio = bio->bi_private; mddev_t *mddev = r10_bio->mddev; + conf_t *conf = mddev->private; + int i,d; + + for (i = 0; i < conf->copies; i++) + if (r10_bio->devs[i].bio == bio) + break; + d = r10_bio->devs[i].devnum; + if (!uptodate) + md_error(mddev, conf->mirrors[d].rdev); + + update_head_pos(i, r10_bio); + + rdev_dec_pending(conf->mirrors[d].rdev, mddev); while (atomic_dec_and_test(&r10_bio->remaining)) { if (r10_bio->master_bio == NULL) { /* the primary of several recovery bios */ sector_t s = r10_bio->sectors; - if (test_bit(R10BIO_MadeGood, &r10_bio->state) || - test_bit(R10BIO_WriteError, &r10_bio->state)) - reschedule_retry(r10_bio); - else - put_buf(r10_bio); + put_buf(r10_bio); md_done_sync(mddev, s, 1); break; } else { r10bio_t *r10_bio2 = (r10bio_t *)r10_bio->master_bio; - if (test_bit(R10BIO_MadeGood, &r10_bio->state) || - test_bit(R10BIO_WriteError, &r10_bio->state)) - reschedule_retry(r10_bio); - else - put_buf(r10_bio); + put_buf(r10_bio); r10_bio = r10_bio2; } } } -static void end_sync_write(struct bio *bio, int error) -{ - int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - r10bio_t *r10_bio = bio->bi_private; - mddev_t *mddev = r10_bio->mddev; - conf_t *conf = mddev->private; - int d; - sector_t first_bad; - int bad_sectors; - int slot; - - d = find_bio_disk(conf, r10_bio, bio, &slot); - - if (!uptodate) { - set_bit(WriteErrorSeen, &conf->mirrors[d].rdev->flags); - set_bit(R10BIO_WriteError, &r10_bio->state); - } else if (is_badblock(conf->mirrors[d].rdev, - r10_bio->devs[slot].addr, - r10_bio->sectors, - &first_bad, &bad_sectors)) - set_bit(R10BIO_MadeGood, &r10_bio->state); - - rdev_dec_pending(conf->mirrors[d].rdev, mddev); - - end_sync_request(r10_bio); -} - /* * Note: sync and recover and handled very differently for raid10 * This code is for resync. @@ -1551,12 +1299,11 @@ static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio) if (j == vcnt) continue; mddev->resync_mismatches += r10_bio->sectors; - if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) - /* Don't fix anything. */ - continue; } - /* Ok, we need to write this bio, either to correct an - * inconsistency or to correct an unreadable block. + if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) + /* Don't fix anything. 
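+ * ("check" requested through sysfs sets MD_RECOVERY_CHECK, e.g.
+ *
+ *	echo check > /sys/block/md0/md/sync_action
+ *
+ * so mismatches are only counted in resync_mismatches and never
+ * rewritten; "repair" leaves the bit clear and falls through to
+ * the write-back below.  The md0 path is just an example.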
*/ + continue; + /* Ok, we need to write this bio * First we need to fixup bv_offset, bv_len and * bi_vecs, as the read request might have corrupted these */ @@ -1608,107 +1355,32 @@ static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio) * The second for writing. * */ -static void fix_recovery_read_error(r10bio_t *r10_bio) -{ - /* We got a read error during recovery. - * We repeat the read in smaller page-sized sections. - * If a read succeeds, write it to the new device or record - * a bad block if we cannot. - * If a read fails, record a bad block on both old and - * new devices. - */ - mddev_t *mddev = r10_bio->mddev; - conf_t *conf = mddev->private; - struct bio *bio = r10_bio->devs[0].bio; - sector_t sect = 0; - int sectors = r10_bio->sectors; - int idx = 0; - int dr = r10_bio->devs[0].devnum; - int dw = r10_bio->devs[1].devnum; - - while (sectors) { - int s = sectors; - mdk_rdev_t *rdev; - sector_t addr; - int ok; - - if (s > (PAGE_SIZE>>9)) - s = PAGE_SIZE >> 9; - - rdev = conf->mirrors[dr].rdev; - addr = r10_bio->devs[0].addr + sect, - ok = sync_page_io(rdev, - addr, - s << 9, - bio->bi_io_vec[idx].bv_page, - READ, false); - if (ok) { - rdev = conf->mirrors[dw].rdev; - addr = r10_bio->devs[1].addr + sect; - ok = sync_page_io(rdev, - addr, - s << 9, - bio->bi_io_vec[idx].bv_page, - WRITE, false); - if (!ok) - set_bit(WriteErrorSeen, &rdev->flags); - } - if (!ok) { - /* We don't worry if we cannot set a bad block - - * it really is bad so there is no loss in not - * recording it yet - */ - rdev_set_badblocks(rdev, addr, s, 0); - - if (rdev != conf->mirrors[dw].rdev) { - /* need bad block on destination too */ - mdk_rdev_t *rdev2 = conf->mirrors[dw].rdev; - addr = r10_bio->devs[1].addr + sect; - ok = rdev_set_badblocks(rdev2, addr, s, 0); - if (!ok) { - /* just abort the recovery */ - printk(KERN_NOTICE - "md/raid10:%s: recovery aborted" - " due to read error\n", - mdname(mddev)); - - conf->mirrors[dw].recovery_disabled - = mddev->recovery_disabled; - set_bit(MD_RECOVERY_INTR, - &mddev->recovery); - break; - } - } - } - - sectors -= s; - sect += s; - idx++; - } -} static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio) { conf_t *conf = mddev->private; - int d; - struct bio *wbio; + int i, d; + struct bio *bio, *wbio; - if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) { - fix_recovery_read_error(r10_bio); - end_sync_request(r10_bio); - return; - } - /* - * share the pages with the first bio + /* move the pages across to the second bio * and submit the write request */ + bio = r10_bio->devs[0].bio; wbio = r10_bio->devs[1].bio; + for (i=0; i < wbio->bi_vcnt; i++) { + struct page *p = bio->bi_io_vec[i].bv_page; + bio->bi_io_vec[i].bv_page = wbio->bi_io_vec[i].bv_page; + wbio->bi_io_vec[i].bv_page = p; + } d = r10_bio->devs[1].devnum; atomic_inc(&conf->mirrors[d].rdev->nr_pending); md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9); - generic_make_request(wbio); + if (test_bit(R10BIO_Uptodate, &r10_bio->state)) + generic_make_request(wbio); + else + bio_endio(wbio, -EIO); } @@ -1749,26 +1421,6 @@ static void check_decay_read_errors(mddev_t *mddev, mdk_rdev_t *rdev) atomic_set(&rdev->read_errors, read_errors >> hours_since_last); } -static int r10_sync_page_io(mdk_rdev_t *rdev, sector_t sector, - int sectors, struct page *page, int rw) -{ - sector_t first_bad; - int bad_sectors; - - if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors) - && (rw == READ || test_bit(WriteErrorSeen, &rdev->flags))) - return -1; - if (sync_page_io(rdev, sector, 
sectors << 9, page, rw, false)) - /* success */ - return 1; - if (rw == WRITE) - set_bit(WriteErrorSeen, &rdev->flags); - /* need to record an error - either for the block or the device */ - if (!rdev_set_badblocks(rdev, sector, sectors, 0)) - md_error(rdev->mddev, rdev); - return 0; -} - /* * This is a kernel thread which: * @@ -1824,15 +1476,10 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) rcu_read_lock(); do { - sector_t first_bad; - int bad_sectors; - d = r10_bio->devs[sl].devnum; rdev = rcu_dereference(conf->mirrors[d].rdev); if (rdev && - test_bit(In_sync, &rdev->flags) && - is_badblock(rdev, r10_bio->devs[sl].addr + sect, s, - &first_bad, &bad_sectors) == 0) { + test_bit(In_sync, &rdev->flags)) { atomic_inc(&rdev->nr_pending); rcu_read_unlock(); success = sync_page_io(rdev, @@ -1852,19 +1499,9 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) rcu_read_unlock(); if (!success) { - /* Cannot read from anywhere, just mark the block - * as bad on the first device to discourage future - * reads. - */ + /* Cannot read from anywhere -- bye bye array */ int dn = r10_bio->devs[r10_bio->read_slot].devnum; - rdev = conf->mirrors[dn].rdev; - - if (!rdev_set_badblocks( - rdev, - r10_bio->devs[r10_bio->read_slot].addr - + sect, - s, 0)) - md_error(mddev, rdev); + md_error(mddev, conf->mirrors[dn].rdev); break; } @@ -1879,82 +1516,80 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) sl--; d = r10_bio->devs[sl].devnum; rdev = rcu_dereference(conf->mirrors[d].rdev); - if (!rdev || - !test_bit(In_sync, &rdev->flags)) - continue; - - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - if (r10_sync_page_io(rdev, - r10_bio->devs[sl].addr + - sect, - s<<9, conf->tmppage, WRITE) - == 0) { - /* Well, this device is dead */ - printk(KERN_NOTICE - "md/raid10:%s: read correction " - "write failed" - " (%d sectors at %llu on %s)\n", - mdname(mddev), s, - (unsigned long long)( - sect + rdev->data_offset), - bdevname(rdev->bdev, b)); - printk(KERN_NOTICE "md/raid10:%s: %s: failing " - "drive\n", - mdname(mddev), - bdevname(rdev->bdev, b)); + if (rdev && + test_bit(In_sync, &rdev->flags)) { + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + atomic_add(s, &rdev->corrected_errors); + if (sync_page_io(rdev, + r10_bio->devs[sl].addr + + sect, + s<<9, conf->tmppage, WRITE, false) + == 0) { + /* Well, this device is dead */ + printk(KERN_NOTICE + "md/raid10:%s: read correction " + "write failed" + " (%d sectors at %llu on %s)\n", + mdname(mddev), s, + (unsigned long long)( + sect + rdev->data_offset), + bdevname(rdev->bdev, b)); + printk(KERN_NOTICE "md/raid10:%s: %s: failing " + "drive\n", + mdname(mddev), + bdevname(rdev->bdev, b)); + md_error(mddev, rdev); + } + rdev_dec_pending(rdev, mddev); + rcu_read_lock(); } - rdev_dec_pending(rdev, mddev); - rcu_read_lock(); } sl = start; while (sl != r10_bio->read_slot) { - char b[BDEVNAME_SIZE]; if (sl==0) sl = conf->copies; sl--; d = r10_bio->devs[sl].devnum; rdev = rcu_dereference(conf->mirrors[d].rdev); - if (!rdev || - !test_bit(In_sync, &rdev->flags)) - continue; + if (rdev && + test_bit(In_sync, &rdev->flags)) { + char b[BDEVNAME_SIZE]; + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + if (sync_page_io(rdev, + r10_bio->devs[sl].addr + + sect, + s<<9, conf->tmppage, + READ, false) == 0) { + /* Well, this device is dead */ + printk(KERN_NOTICE + "md/raid10:%s: unable to read back " + "corrected sectors" + " (%d sectors at %llu on %s)\n", + mdname(mddev), s, + (unsigned 
long long)( + sect + rdev->data_offset), + bdevname(rdev->bdev, b)); + printk(KERN_NOTICE "md/raid10:%s: %s: failing drive\n", + mdname(mddev), + bdevname(rdev->bdev, b)); + + md_error(mddev, rdev); + } else { + printk(KERN_INFO + "md/raid10:%s: read error corrected" + " (%d sectors at %llu on %s)\n", + mdname(mddev), s, + (unsigned long long)( + sect + rdev->data_offset), + bdevname(rdev->bdev, b)); + } - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - switch (r10_sync_page_io(rdev, - r10_bio->devs[sl].addr + - sect, - s<<9, conf->tmppage, - READ)) { - case 0: - /* Well, this device is dead */ - printk(KERN_NOTICE - "md/raid10:%s: unable to read back " - "corrected sectors" - " (%d sectors at %llu on %s)\n", - mdname(mddev), s, - (unsigned long long)( - sect + rdev->data_offset), - bdevname(rdev->bdev, b)); - printk(KERN_NOTICE "md/raid10:%s: %s: failing " - "drive\n", - mdname(mddev), - bdevname(rdev->bdev, b)); - break; - case 1: - printk(KERN_INFO - "md/raid10:%s: read error corrected" - " (%d sectors at %llu on %s)\n", - mdname(mddev), s, - (unsigned long long)( - sect + rdev->data_offset), - bdevname(rdev->bdev, b)); - atomic_add(s, &rdev->corrected_errors); + rdev_dec_pending(rdev, mddev); + rcu_read_lock(); } - - rdev_dec_pending(rdev, mddev); - rcu_read_lock(); } rcu_read_unlock(); @@ -1963,254 +1598,21 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) } } -static void bi_complete(struct bio *bio, int error) -{ - complete((struct completion *)bio->bi_private); -} - -static int submit_bio_wait(int rw, struct bio *bio) -{ - struct completion event; - rw |= REQ_SYNC; - - init_completion(&event); - bio->bi_private = &event; - bio->bi_end_io = bi_complete; - submit_bio(rw, bio); - wait_for_completion(&event); - - return test_bit(BIO_UPTODATE, &bio->bi_flags); -} - -static int narrow_write_error(r10bio_t *r10_bio, int i) -{ - struct bio *bio = r10_bio->master_bio; - mddev_t *mddev = r10_bio->mddev; - conf_t *conf = mddev->private; - mdk_rdev_t *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev; - /* bio has the data to be written to slot 'i' where - * we just recently had a write error. - * We repeatedly clone the bio and trim down to one block, - * then try the write. Where the write fails we record - * a bad block. - * It is conceivable that the bio doesn't exactly align with - * blocks. We must handle this. - * - * We currently own a reference to the rdev. - */ - - int block_sectors; - sector_t sector; - int sectors; - int sect_to_write = r10_bio->sectors; - int ok = 1; - - if (rdev->badblocks.shift < 0) - return 0; - - block_sectors = 1 << rdev->badblocks.shift; - sector = r10_bio->sector; - sectors = ((r10_bio->sector + block_sectors) - & ~(sector_t)(block_sectors - 1)) - - sector; - - while (sect_to_write) { - struct bio *wbio; - if (sectors > sect_to_write) - sectors = sect_to_write; - /* Write at 'sector' for 'sectors' */ - wbio = bio_clone_mddev(bio, GFP_NOIO, mddev); - md_trim_bio(wbio, sector - bio->bi_sector, sectors); - wbio->bi_sector = (r10_bio->devs[i].addr+ - rdev->data_offset+ - (sector - r10_bio->sector)); - wbio->bi_bdev = rdev->bdev; - if (submit_bio_wait(WRITE, wbio) == 0) - /* Failure! 
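- *
- * (The splitting above keeps each retried write inside a single
- *  badblocks granule: with rdev->badblocks.shift == 3, i.e.
- *  8-sector blocks, a 20-sector write at sector 21 goes out as
- *  chunks of 3, 8, 8 and 1 sectors -- 21..23, 24..31, 32..39, 40 --
- *  the first chunk being ((21 + 8) & ~7) - 21 = 3.  Numbers are
- *  only a worked example.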
*/ - ok = rdev_set_badblocks(rdev, sector, - sectors, 0) - && ok; - - bio_put(wbio); - sect_to_write -= sectors; - sector += sectors; - sectors = block_sectors; - } - return ok; -} - -static void handle_read_error(mddev_t *mddev, r10bio_t *r10_bio) -{ - int slot = r10_bio->read_slot; - int mirror = r10_bio->devs[slot].devnum; - struct bio *bio; - conf_t *conf = mddev->private; - mdk_rdev_t *rdev; - char b[BDEVNAME_SIZE]; - unsigned long do_sync; - int max_sectors; - - /* we got a read error. Maybe the drive is bad. Maybe just - * the block and we can fix it. - * We freeze all other IO, and try reading the block from - * other devices. When we find one, we re-write - * and check it that fixes the read error. - * This is all done synchronously while the array is - * frozen. - */ - if (mddev->ro == 0) { - freeze_array(conf); - fix_read_error(conf, mddev, r10_bio); - unfreeze_array(conf); - } - rdev_dec_pending(conf->mirrors[mirror].rdev, mddev); - - bio = r10_bio->devs[slot].bio; - bdevname(bio->bi_bdev, b); - r10_bio->devs[slot].bio = - mddev->ro ? IO_BLOCKED : NULL; -read_more: - mirror = read_balance(conf, r10_bio, &max_sectors); - if (mirror == -1) { - printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O" - " read error for block %llu\n", - mdname(mddev), b, - (unsigned long long)r10_bio->sector); - raid_end_bio_io(r10_bio); - bio_put(bio); - return; - } - - do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC); - if (bio) - bio_put(bio); - slot = r10_bio->read_slot; - rdev = conf->mirrors[mirror].rdev; - printk_ratelimited( - KERN_ERR - "md/raid10:%s: %s: redirecting" - "sector %llu to another mirror\n", - mdname(mddev), - bdevname(rdev->bdev, b), - (unsigned long long)r10_bio->sector); - bio = bio_clone_mddev(r10_bio->master_bio, - GFP_NOIO, mddev); - md_trim_bio(bio, - r10_bio->sector - bio->bi_sector, - max_sectors); - r10_bio->devs[slot].bio = bio; - bio->bi_sector = r10_bio->devs[slot].addr - + rdev->data_offset; - bio->bi_bdev = rdev->bdev; - bio->bi_rw = READ | do_sync; - bio->bi_private = r10_bio; - bio->bi_end_io = raid10_end_read_request; - if (max_sectors < r10_bio->sectors) { - /* Drat - have to split this up more */ - struct bio *mbio = r10_bio->master_bio; - int sectors_handled = - r10_bio->sector + max_sectors - - mbio->bi_sector; - r10_bio->sectors = max_sectors; - spin_lock_irq(&conf->device_lock); - if (mbio->bi_phys_segments == 0) - mbio->bi_phys_segments = 2; - else - mbio->bi_phys_segments++; - spin_unlock_irq(&conf->device_lock); - generic_make_request(bio); - bio = NULL; - - r10_bio = mempool_alloc(conf->r10bio_pool, - GFP_NOIO); - r10_bio->master_bio = mbio; - r10_bio->sectors = (mbio->bi_size >> 9) - - sectors_handled; - r10_bio->state = 0; - set_bit(R10BIO_ReadError, - &r10_bio->state); - r10_bio->mddev = mddev; - r10_bio->sector = mbio->bi_sector - + sectors_handled; - - goto read_more; - } else - generic_make_request(bio); -} - -static void handle_write_completed(conf_t *conf, r10bio_t *r10_bio) -{ - /* Some sort of write request has finished and it - * succeeded in writing where we thought there was a - * bad block. So forget the bad block. - * Or possibly if failed and we need to record - * a bad block. 
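- *
- * (In outline: a write that succeeded across a known-bad range
- *  lets us rdev_clear_badblocks() it; a failed write is retried
- *  block-by-block via narrow_write_error(), and whatever still
- *  fails is either recorded with rdev_set_badblocks() or, if even
- *  that fails, takes the whole device down via md_error().)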
- */ - int m; - mdk_rdev_t *rdev; - - if (test_bit(R10BIO_IsSync, &r10_bio->state) || - test_bit(R10BIO_IsRecover, &r10_bio->state)) { - for (m = 0; m < conf->copies; m++) { - int dev = r10_bio->devs[m].devnum; - rdev = conf->mirrors[dev].rdev; - if (r10_bio->devs[m].bio == NULL) - continue; - if (test_bit(BIO_UPTODATE, - &r10_bio->devs[m].bio->bi_flags)) { - rdev_clear_badblocks( - rdev, - r10_bio->devs[m].addr, - r10_bio->sectors); - } else { - if (!rdev_set_badblocks( - rdev, - r10_bio->devs[m].addr, - r10_bio->sectors, 0)) - md_error(conf->mddev, rdev); - } - } - put_buf(r10_bio); - } else { - for (m = 0; m < conf->copies; m++) { - int dev = r10_bio->devs[m].devnum; - struct bio *bio = r10_bio->devs[m].bio; - rdev = conf->mirrors[dev].rdev; - if (bio == IO_MADE_GOOD) { - rdev_clear_badblocks( - rdev, - r10_bio->devs[m].addr, - r10_bio->sectors); - rdev_dec_pending(rdev, conf->mddev); - } else if (bio != NULL && - !test_bit(BIO_UPTODATE, &bio->bi_flags)) { - if (!narrow_write_error(r10_bio, m)) { - md_error(conf->mddev, rdev); - set_bit(R10BIO_Degraded, - &r10_bio->state); - } - rdev_dec_pending(rdev, conf->mddev); - } - } - if (test_bit(R10BIO_WriteError, - &r10_bio->state)) - close_write(r10_bio); - raid_end_bio_io(r10_bio); - } -} - static void raid10d(mddev_t *mddev) { r10bio_t *r10_bio; + struct bio *bio; unsigned long flags; conf_t *conf = mddev->private; struct list_head *head = &conf->retry_list; + mdk_rdev_t *rdev; struct blk_plug plug; md_check_recovery(mddev); blk_start_plug(&plug); for (;;) { + char b[BDEVNAME_SIZE]; flush_pending_writes(conf); @@ -2226,26 +1628,64 @@ static void raid10d(mddev_t *mddev) mddev = r10_bio->mddev; conf = mddev->private; - if (test_bit(R10BIO_MadeGood, &r10_bio->state) || - test_bit(R10BIO_WriteError, &r10_bio->state)) - handle_write_completed(conf, r10_bio); - else if (test_bit(R10BIO_IsSync, &r10_bio->state)) + if (test_bit(R10BIO_IsSync, &r10_bio->state)) sync_request_write(mddev, r10_bio); else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) recovery_request_write(mddev, r10_bio); - else if (test_bit(R10BIO_ReadError, &r10_bio->state)) - handle_read_error(mddev, r10_bio); else { - /* just a partial read to be scheduled from a - * separate context - */ int slot = r10_bio->read_slot; - generic_make_request(r10_bio->devs[slot].bio); + int mirror = r10_bio->devs[slot].devnum; + /* we got a read error. Maybe the drive is bad. Maybe just + * the block and we can fix it. + * We freeze all other IO, and try reading the block from + * other devices. When we find one, we re-write + * and check it that fixes the read error. + * This is all done synchronously while the array is + * frozen. + */ + if (mddev->ro == 0) { + freeze_array(conf); + fix_read_error(conf, mddev, r10_bio); + unfreeze_array(conf); + } + rdev_dec_pending(conf->mirrors[mirror].rdev, mddev); + + bio = r10_bio->devs[slot].bio; + r10_bio->devs[slot].bio = + mddev->ro ? 
IO_BLOCKED : NULL; + mirror = read_balance(conf, r10_bio); + if (mirror == -1) { + printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O" + " read error for block %llu\n", + mdname(mddev), + bdevname(bio->bi_bdev,b), + (unsigned long long)r10_bio->sector); + raid_end_bio_io(r10_bio); + bio_put(bio); + } else { + const unsigned long do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC); + bio_put(bio); + slot = r10_bio->read_slot; + rdev = conf->mirrors[mirror].rdev; + if (printk_ratelimit()) + printk(KERN_ERR "md/raid10:%s: %s: redirecting sector %llu to" + " another mirror\n", + mdname(mddev), + bdevname(rdev->bdev,b), + (unsigned long long)r10_bio->sector); + bio = bio_clone_mddev(r10_bio->master_bio, + GFP_NOIO, mddev); + r10_bio->devs[slot].bio = bio; + bio->bi_sector = r10_bio->devs[slot].addr + + rdev->data_offset; + bio->bi_bdev = rdev->bdev; + bio->bi_rw = READ | do_sync; + bio->bi_private = r10_bio; + bio->bi_end_io = raid10_end_read_request; + generic_make_request(bio); + } } - cond_resched(); - if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) md_check_recovery(mddev); } blk_finish_plug(&plug); } @@ ... @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) { /* recovery... the complicated one */ - int j; + int j, k; r10_bio = NULL; for (i=0 ; i<conf->raid_disks; i++) { @@ -2395,7 +1836,6 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, r10bio_t *rb2; sector_t sect; int must_sync; - int any_working; if (conf->mirrors[i].rdev == NULL || test_bit(In_sync, &conf->mirrors[i].rdev->flags)) @@ -2447,42 +1887,19 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, must_sync = bitmap_start_sync(mddev->bitmap, sect, &sync_blocks, still_degraded); - any_working = 0; for (j=0; j<conf->copies;j++) { - int k; int d = r10_bio->devs[j].devnum; - sector_t from_addr, to_addr; - mdk_rdev_t *rdev; - sector_t sector, first_bad; - int bad_sectors; if (!conf->mirrors[d].rdev || !test_bit(In_sync, &conf->mirrors[d].rdev->flags)) continue; /* This is where we read from */ - any_working = 1; - rdev = conf->mirrors[d].rdev; - sector = r10_bio->devs[j].addr; - - if (is_badblock(rdev, sector, max_sync, - &first_bad, &bad_sectors)) { - if (first_bad > sector) - max_sync = first_bad - sector; - else { - bad_sectors -= (sector - - first_bad); - if (max_sync > bad_sectors) - max_sync = bad_sectors; - continue; - } - } bio = r10_bio->devs[0].bio; bio->bi_next = biolist; biolist = bio; bio->bi_private = r10_bio; bio->bi_end_io = end_sync_read; bio->bi_rw = READ; - from_addr = r10_bio->devs[j].addr; - bio->bi_sector = from_addr + + bio->bi_sector = r10_bio->devs[j].addr + conf->mirrors[d].rdev->data_offset; bio->bi_bdev = conf->mirrors[d].rdev->bdev; atomic_inc(&conf->mirrors[d].rdev->nr_pending); @@ -2499,48 +1916,26 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, bio->bi_private = r10_bio; bio->bi_end_io = end_sync_write; bio->bi_rw = WRITE; - to_addr = r10_bio->devs[k].addr; - bio->bi_sector = to_addr + + bio->bi_sector = r10_bio->devs[k].addr + conf->mirrors[i].rdev->data_offset; bio->bi_bdev = conf->mirrors[i].rdev->bdev; r10_bio->devs[0].devnum = d; - r10_bio->devs[0].addr = from_addr; r10_bio->devs[1].devnum = i; - r10_bio->devs[1].addr = to_addr; break; } if (j == conf->copies) { - /* Cannot recover, so abort the recovery or - * record a bad block */ + /* Cannot recover, so abort the recovery */ put_buf(r10_bio); if (rb2) atomic_dec(&rb2->remaining); r10_bio = rb2; - if (any_working) { - /* problem is that there are bad blocks - * on other device(s) - */ - int k; - for (k = 0; k < conf->copies; k++) - if (r10_bio->devs[k].devnum == i) - break; - if (!rdev_set_badblocks( - conf->mirrors[i].rdev, - 
r10_bio->devs[k].addr, - max_sync, 0)) - any_working = 0; - } - if (!any_working) { - if (!test_and_set_bit(MD_RECOVERY_INTR, - &mddev->recovery)) - printk(KERN_INFO "md/raid10:%s: insufficient " - "working devices for recovery.\n", - mdname(mddev)); - conf->mirrors[i].recovery_disabled - = mddev->recovery_disabled; - } + if (!test_and_set_bit(MD_RECOVERY_INTR, + &mddev->recovery)) + printk(KERN_INFO "md/raid10:%s: insufficient " + "working devices for recovery.\n", + mdname(mddev)); break; } } @@ -2584,28 +1979,12 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, for (i=0; i<conf->copies; i++) { int d = r10_bio->devs[i].devnum; - sector_t first_bad, sector; - int bad_sectors; - bio = r10_bio->devs[i].bio; bio->bi_end_io = NULL; clear_bit(BIO_UPTODATE, &bio->bi_flags); if (conf->mirrors[d].rdev == NULL || test_bit(Faulty, &conf->mirrors[d].rdev->flags)) continue; - sector = r10_bio->devs[i].addr; - if (is_badblock(conf->mirrors[d].rdev, - sector, max_sync, - &first_bad, &bad_sectors)) { - if (first_bad > sector) - max_sync = first_bad - sector; - else { - bad_sectors -= (sector - first_bad); - if (max_sync > bad_sectors) - max_sync = max_sync; - continue; - } - } atomic_inc(&conf->mirrors[d].rdev->nr_pending); atomic_inc(&r10_bio->remaining); bio->bi_next = biolist; @@ -2613,7 +1992,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, bio->bi_private = r10_bio; bio->bi_end_io = end_sync_read; bio->bi_rw = READ; - bio->bi_sector = sector + + bio->bi_sector = r10_bio->devs[i].addr + conf->mirrors[d].rdev->data_offset; bio->bi_bdev = conf->mirrors[d].rdev->bdev; count++; @@ -2700,8 +2079,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, return sectors_skipped + nr_sectors; giveup: /* There is nowhere to write, so all non-sync - * drives must be failed or in resync, all drives - * have a bad block, so try the next chunk... + * drives must be failed, so try the next chunk... */ if (sector_nr + max_sync < max_sector) max_sector = sector_nr + max_sync; @@ -2871,7 +2249,6 @@ static int run(mddev_t *mddev) (conf->raid_disks / conf->near_copies)); list_for_each_entry(rdev, &mddev->disks, same_set) { - disk_idx = rdev->raid_disk; if (disk_idx >= conf->raid_disks || disk_idx < 0) @@ -2894,7 +2271,7 @@ static int run(mddev_t *mddev) disk->head_position = 0; } /* need to check that every block has at least one working mirror */ - if (!enough(conf, -1)) { + if (!enough(conf)) { printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n", mdname(mddev)); goto out_free_conf; diff --git a/trunk/drivers/md/raid10.h b/trunk/drivers/md/raid10.h index 79cb52a0d4a2..944b1104d3b4 100644 --- a/trunk/drivers/md/raid10.h +++ b/trunk/drivers/md/raid10.h @@ -6,11 +6,6 @@ typedef struct mirror_info mirror_info_t; struct mirror_info { mdk_rdev_t *rdev; sector_t head_position; - int recovery_disabled; /* matches - * mddev->recovery_disabled - * when we shouldn't try - * recovering this device. - */ }; typedef struct r10bio_s r10bio_t; @@ -118,26 +113,10 @@ struct r10bio_s { * level, we store IO_BLOCKED in the appropriate 'bios' pointer */ #define IO_BLOCKED ((struct bio*)1) -/* When we successfully write to a known bad-block, we need to remove the - * bad-block marking which must be done from process context. 
So we record - * the success by setting devs[n].bio to IO_MADE_GOOD - */ -#define IO_MADE_GOOD ((struct bio *)2) - -#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) /* bits for r10bio.state */ #define R10BIO_Uptodate 0 #define R10BIO_IsSync 1 #define R10BIO_IsRecover 2 #define R10BIO_Degraded 3 -/* Set ReadError on bios that experience a read error - * so that raid10d knows what to do with them. - */ -#define R10BIO_ReadError 4 -/* If a write for this request means we can clear some - * known-bad-block records, we set this flag. - */ -#define R10BIO_MadeGood 5 -#define R10BIO_WriteError 6 #endif diff --git a/trunk/drivers/md/raid5.c b/trunk/drivers/md/raid5.c index dbae459fb02d..b72edf35ec54 100644 --- a/trunk/drivers/md/raid5.c +++ b/trunk/drivers/md/raid5.c @@ -51,7 +51,6 @@ #include <linux/seq_file.h> #include <linux/cpu.h> #include <linux/slab.h> -#include <linux/ratelimit.h> #include "md.h" #include "raid5.h" #include "raid0.h" @@ -97,6 +96,8 @@ #define __inline__ #endif +#define printk_rl(args...) ((void) (printk_ratelimit() && printk(args))) + /* * We maintain a biased count of active stripes in the bottom 16 bits of * bi_phys_segments, and a count of processed stripes in the upper 16 bits @@ -340,7 +341,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) (unsigned long long)sh->sector, i, dev->toread, dev->read, dev->towrite, dev->written, test_bit(R5_LOCKED, &dev->flags)); - WARN_ON(1); + BUG(); } dev->flags = 0; raid5_build_block(sh, i, previous); @@ -526,36 +527,6 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) atomic_inc(&rdev->nr_pending); rcu_read_unlock(); - /* We have already checked bad blocks for reads. Now - * need to check for writes. - */ - while ((rw & WRITE) && rdev && - test_bit(WriteErrorSeen, &rdev->flags)) { - sector_t first_bad; - int bad_sectors; - int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, - &first_bad, &bad_sectors); - if (!bad) - break; - - if (bad < 0) { - set_bit(BlockedBadBlocks, &rdev->flags); - if (!conf->mddev->external && - conf->mddev->flags) { - /* It is very unlikely, but we might - * still need to write out the - * bad block log - better give it - * a chance*/ - md_check_recovery(conf->mddev); - } - md_wait_for_blocked_rdev(rdev, conf->mddev); - } else { - /* Acknowledged bad block - skip the write */ - rdev_dec_pending(rdev, conf->mddev); - rdev = NULL; - } - } - if (rdev) { if (s->syncing || s->expanding || s->expanded) md_sync_acct(rdev->bdev, STRIPE_SECTORS); @@ -577,6 +548,10 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) bi->bi_io_vec[0].bv_offset = 0; bi->bi_size = STRIPE_SIZE; bi->bi_next = NULL; + if ((rw & WRITE) && + test_bit(R5_ReWrite, &sh->dev[i].flags)) + atomic_add(STRIPE_SECTORS, + &rdev->corrected_errors); generic_make_request(bi); } else { if (rw & WRITE) @@ -1045,12 +1020,12 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) { struct bio *wbi; - spin_lock_irq(&sh->raid_conf->device_lock); + spin_lock(&sh->lock); chosen = dev->towrite; dev->towrite = NULL; BUG_ON(dev->written); wbi = dev->written = chosen; - spin_unlock_irq(&sh->raid_conf->device_lock); + spin_unlock(&sh->lock); while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) { @@ -1340,11 +1315,12 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) static int grow_one_stripe(raid5_conf_t *conf) { struct stripe_head *sh; - sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL); + sh = kmem_cache_alloc(conf->slab_cache, 
GFP_KERNEL); if (!sh) return 0; - + memset(sh, 0, sizeof(*sh) + (conf->pool_size-1)*sizeof(struct r5dev)); sh->raid_conf = conf; + spin_lock_init(&sh->lock); #ifdef CONFIG_MULTICORE_RAID456 init_waitqueue_head(&sh->ops.wait_for_ops); #endif @@ -1459,11 +1435,14 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) return -ENOMEM; for (i = conf->max_nr_stripes; i; i--) { - nsh = kmem_cache_zalloc(sc, GFP_KERNEL); + nsh = kmem_cache_alloc(sc, GFP_KERNEL); if (!nsh) break; + memset(nsh, 0, sizeof(*nsh) + (newsize-1)*sizeof(struct r5dev)); + nsh->raid_conf = conf; + spin_lock_init(&nsh->lock); #ifdef CONFIG_MULTICORE_RAID456 init_waitqueue_head(&nsh->ops.wait_for_ops); #endif @@ -1608,15 +1587,12 @@ static void raid5_end_read_request(struct bio * bi, int error) set_bit(R5_UPTODATE, &sh->dev[i].flags); if (test_bit(R5_ReadError, &sh->dev[i].flags)) { rdev = conf->disks[i].rdev; - printk_ratelimited( - KERN_INFO - "md/raid:%s: read error corrected" - " (%lu sectors at %llu on %s)\n", - mdname(conf->mddev), STRIPE_SECTORS, - (unsigned long long)(sh->sector - + rdev->data_offset), - bdevname(rdev->bdev, b)); - atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); + printk_rl(KERN_INFO "md/raid:%s: read error corrected" + " (%lu sectors at %llu on %s)\n", + mdname(conf->mddev), STRIPE_SECTORS, + (unsigned long long)(sh->sector + + rdev->data_offset), + bdevname(rdev->bdev, b)); clear_bit(R5_ReadError, &sh->dev[i].flags); clear_bit(R5_ReWrite, &sh->dev[i].flags); } @@ -1630,24 +1606,22 @@ static void raid5_end_read_request(struct bio * bi, int error) clear_bit(R5_UPTODATE, &sh->dev[i].flags); atomic_inc(&rdev->read_errors); if (conf->mddev->degraded >= conf->max_degraded) - printk_ratelimited( - KERN_WARNING - "md/raid:%s: read error not correctable " - "(sector %llu on %s).\n", - mdname(conf->mddev), - (unsigned long long)(sh->sector - + rdev->data_offset), - bdn); + printk_rl(KERN_WARNING + "md/raid:%s: read error not correctable " + "(sector %llu on %s).\n", + mdname(conf->mddev), + (unsigned long long)(sh->sector + + rdev->data_offset), + bdn); else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) /* Oh, no!!! */ - printk_ratelimited( - KERN_WARNING - "md/raid:%s: read error NOT corrected!! " - "(sector %llu on %s).\n", - mdname(conf->mddev), - (unsigned long long)(sh->sector - + rdev->data_offset), - bdn); + printk_rl(KERN_WARNING + "md/raid:%s: read error NOT corrected!! 
" + "(sector %llu on %s).\n", + mdname(conf->mddev), + (unsigned long long)(sh->sector + + rdev->data_offset), + bdn); else if (atomic_read(&rdev->read_errors) > conf->max_nr_stripes) printk(KERN_WARNING @@ -1675,8 +1649,6 @@ static void raid5_end_write_request(struct bio *bi, int error) raid5_conf_t *conf = sh->raid_conf; int disks = sh->disks, i; int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); - sector_t first_bad; - int bad_sectors; for (i=0 ; idev[i].req) @@ -1690,12 +1662,8 @@ static void raid5_end_write_request(struct bio *bi, int error) return; } - if (!uptodate) { - set_bit(WriteErrorSeen, &conf->disks[i].rdev->flags); - set_bit(R5_WriteError, &sh->dev[i].flags); - } else if (is_badblock(conf->disks[i].rdev, sh->sector, STRIPE_SECTORS, - &first_bad, &bad_sectors)) - set_bit(R5_MadeGood, &sh->dev[i].flags); + if (!uptodate) + md_error(conf->mddev, conf->disks[i].rdev); rdev_dec_pending(conf->disks[i].rdev, conf->mddev); @@ -1742,7 +1710,6 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) */ set_bit(MD_RECOVERY_INTR, &mddev->recovery); } - set_bit(Blocked, &rdev->flags); set_bit(Faulty, &rdev->flags); set_bit(MD_CHANGE_DEVS, &mddev->flags); printk(KERN_ALERT @@ -1793,7 +1760,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, /* * Select the parity disk based on the user selected algorithm. */ - pd_idx = qd_idx = -1; + pd_idx = qd_idx = ~0; switch(conf->level) { case 4: pd_idx = data_disks; @@ -2176,11 +2143,12 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in raid5_conf_t *conf = sh->raid_conf; int firstwrite=0; - pr_debug("adding bi b#%llu to stripe s#%llu\n", + pr_debug("adding bh b#%llu to stripe s#%llu\n", (unsigned long long)bi->bi_sector, (unsigned long long)sh->sector); + spin_lock(&sh->lock); spin_lock_irq(&conf->device_lock); if (forwrite) { bip = &sh->dev[dd_idx].towrite; @@ -2201,6 +2169,19 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in bi->bi_next = *bip; *bip = bi; bi->bi_phys_segments++; + spin_unlock_irq(&conf->device_lock); + spin_unlock(&sh->lock); + + pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", + (unsigned long long)bi->bi_sector, + (unsigned long long)sh->sector, dd_idx); + + if (conf->mddev->bitmap && firstwrite) { + bitmap_startwrite(conf->mddev->bitmap, sh->sector, + STRIPE_SECTORS, 0); + sh->bm_seq = conf->seq_flush+1; + set_bit(STRIPE_BIT_DELAY, &sh->state); + } if (forwrite) { /* check if page is covered */ @@ -2215,23 +2196,12 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); } - spin_unlock_irq(&conf->device_lock); - - pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", - (unsigned long long)(*bip)->bi_sector, - (unsigned long long)sh->sector, dd_idx); - - if (conf->mddev->bitmap && firstwrite) { - bitmap_startwrite(conf->mddev->bitmap, sh->sector, - STRIPE_SECTORS, 0); - sh->bm_seq = conf->seq_flush+1; - set_bit(STRIPE_BIT_DELAY, &sh->state); - } return 1; overlap: set_bit(R5_Overlap, &sh->dev[dd_idx].flags); spin_unlock_irq(&conf->device_lock); + spin_unlock(&sh->lock); return 0; } @@ -2268,18 +2238,9 @@ handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh, rcu_read_lock(); rdev = rcu_dereference(conf->disks[i].rdev); if (rdev && test_bit(In_sync, &rdev->flags)) - atomic_inc(&rdev->nr_pending); - else - rdev = NULL; + /* multiple read failures in one stripe */ + md_error(conf->mddev, 
rdev); rcu_read_unlock(); - if (rdev) { - if (!rdev_set_badblocks( - rdev, - sh->sector, - STRIPE_SECTORS, 0)) - md_error(conf->mddev, rdev); - rdev_dec_pending(rdev, conf->mddev); - } } spin_lock_irq(&conf->device_lock); /* fail all writes first */ @@ -2347,10 +2308,6 @@ handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh, if (bitmap_end) bitmap_endwrite(conf->mddev->bitmap, sh->sector, STRIPE_SECTORS, 0, 0); - /* If we were in the middle of a write the parity block might - * still be locked - so just clear all R5_LOCKED flags - */ - clear_bit(R5_LOCKED, &sh->dev[i].flags); } if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) @@ -2358,73 +2315,109 @@ handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh, md_wakeup_thread(conf->mddev->thread); } -static void -handle_failed_sync(raid5_conf_t *conf, struct stripe_head *sh, - struct stripe_head_state *s) +/* fetch_block5 - checks the given member device to see if its data needs + * to be read or computed to satisfy a request. + * + * Returns 1 when no more member devices need to be checked, otherwise returns + * 0 to tell the loop in handle_stripe_fill5 to continue + */ +static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s, + int disk_idx, int disks) +{ + struct r5dev *dev = &sh->dev[disk_idx]; + struct r5dev *failed_dev = &sh->dev[s->failed_num]; + + /* is the data in this block needed, and can we get it? */ + if (!test_bit(R5_LOCKED, &dev->flags) && + !test_bit(R5_UPTODATE, &dev->flags) && + (dev->toread || + (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || + s->syncing || s->expanding || + (s->failed && + (failed_dev->toread || + (failed_dev->towrite && + !test_bit(R5_OVERWRITE, &failed_dev->flags)))))) { + /* We would like to get this block, possibly by computing it, + * otherwise read it if the backing disk is insync + */ + if ((s->uptodate == disks - 1) && + (s->failed && disk_idx == s->failed_num)) { + set_bit(STRIPE_COMPUTE_RUN, &sh->state); + set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); + set_bit(R5_Wantcompute, &dev->flags); + sh->ops.target = disk_idx; + sh->ops.target2 = -1; + s->req_compute = 1; + /* Careful: from this point on 'uptodate' is in the eye + * of raid_run_ops which services 'compute' operations + * before writes. R5_Wantcompute flags a block that will + * be R5_UPTODATE by the time it is needed for a + * subsequent operation. + */ + s->uptodate++; + return 1; /* uptodate + compute == disks */ + } else if (test_bit(R5_Insync, &dev->flags)) { + set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantread, &dev->flags); + s->locked++; + pr_debug("Reading block %d (sync=%d)\n", disk_idx, + s->syncing); + } + } + + return 0; +} + +/** + * handle_stripe_fill5 - read or compute data to satisfy pending requests. + */ +static void handle_stripe_fill5(struct stripe_head *sh, + struct stripe_head_state *s, int disks) { - int abort = 0; int i; - md_done_sync(conf->mddev, STRIPE_SECTORS, 0); - clear_bit(STRIPE_SYNCING, &sh->state); - s->syncing = 0; - /* There is nothing more to do for sync/check/repair. 
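- * (sync/check/repair can simply report the failure through
- *  md_done_sync() above and stop; only a real rebuild has to
- *  either pin the unreadable range as bad blocks on the
- *  not-yet-in-sync devices or latch conf->recovery_disabled and
- *  abort, as the loop below does.)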
- * For recover we need to record a bad block on all - * non-sync devices, or abort the recovery - */ - if (!test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) - return; - /* During recovery devices cannot be removed, so locking and - * refcounting of rdevs is not needed + /* look for blocks to read/compute, skip this if a compute + * is already in flight, or if the stripe contents are in the + * midst of changing due to a write */ - for (i = 0; i < conf->raid_disks; i++) { - mdk_rdev_t *rdev = conf->disks[i].rdev; - if (!rdev - || test_bit(Faulty, &rdev->flags) - || test_bit(In_sync, &rdev->flags)) - continue; - if (!rdev_set_badblocks(rdev, sh->sector, - STRIPE_SECTORS, 0)) - abort = 1; - } - if (abort) { - conf->recovery_disabled = conf->mddev->recovery_disabled; - set_bit(MD_RECOVERY_INTR, &conf->mddev->recovery); - } + if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && + !sh->reconstruct_state) + for (i = disks; i--; ) + if (fetch_block5(sh, s, i, disks)) + break; + set_bit(STRIPE_HANDLE, &sh->state); } -/* fetch_block - checks the given member device to see if its data needs +/* fetch_block6 - checks the given member device to see if its data needs * to be read or computed to satisfy a request. * * Returns 1 when no more member devices need to be checked, otherwise returns - * 0 to tell the loop in handle_stripe_fill to continue + * 0 to tell the loop in handle_stripe_fill6 to continue */ -static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, - int disk_idx, int disks) +static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s, + struct r6_state *r6s, int disk_idx, int disks) { struct r5dev *dev = &sh->dev[disk_idx]; - struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]], - &sh->dev[s->failed_num[1]] }; + struct r5dev *fdev[2] = { &sh->dev[r6s->failed_num[0]], + &sh->dev[r6s->failed_num[1]] }; - /* is the data in this block needed, and can we get it? */ if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && (dev->toread || (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || s->syncing || s->expanding || - (s->failed >= 1 && fdev[0]->toread) || - (s->failed >= 2 && fdev[1]->toread) || - (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite && - !test_bit(R5_OVERWRITE, &fdev[0]->flags)) || - (sh->raid_conf->level == 6 && s->failed && s->to_write))) { + (s->failed >= 1 && + (fdev[0]->toread || s->to_write)) || + (s->failed >= 2 && + (fdev[1]->toread || s->to_write)))) { /* we would like to get this block, possibly by computing it, * otherwise read it if the backing disk is insync */ BUG_ON(test_bit(R5_Wantcompute, &dev->flags)); BUG_ON(test_bit(R5_Wantread, &dev->flags)); if ((s->uptodate == disks - 1) && - (s->failed && (disk_idx == s->failed_num[0] || - disk_idx == s->failed_num[1]))) { + (s->failed && (disk_idx == r6s->failed_num[0] || + disk_idx == r6s->failed_num[1]))) { /* have disk failed, and we're requested to fetch it; * do compute it */ @@ -2436,12 +2429,6 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, sh->ops.target = disk_idx; sh->ops.target2 = -1; /* no 2nd target */ s->req_compute = 1; - /* Careful: from this point on 'uptodate' is in the eye - * of raid_run_ops which services 'compute' operations - * before writes. R5_Wantcompute flags a block that will - * be R5_UPTODATE by the time it is needed for a - * subsequent operation. 
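- *
- * (Worked example: a 6-device array with one unreadable block and
- *  uptodate == 5 == disks - 1: the block is scheduled for compute
- *  and counted as uptodate straight away, so the rest of the state
- *  machine can treat the stripe as complete.  The figures are only
- *  illustrative.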
- */ s->uptodate++; return 1; } else if (s->uptodate == disks-2 && s->failed >= 2) { @@ -2482,11 +2469,11 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, } /** - * handle_stripe_fill - read or compute data to satisfy pending requests. + * handle_stripe_fill6 - read or compute data to satisfy pending requests. */ -static void handle_stripe_fill(struct stripe_head *sh, - struct stripe_head_state *s, - int disks) +static void handle_stripe_fill6(struct stripe_head *sh, + struct stripe_head_state *s, struct r6_state *r6s, + int disks) { int i; @@ -2497,7 +2484,7 @@ static void handle_stripe_fill(struct stripe_head *sh, if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && !sh->reconstruct_state) for (i = disks; i--; ) - if (fetch_block(sh, s, i, disks)) + if (fetch_block6(sh, s, r6s, i, disks)) break; set_bit(STRIPE_HANDLE, &sh->state); } @@ -2553,19 +2540,11 @@ static void handle_stripe_clean_event(raid5_conf_t *conf, md_wakeup_thread(conf->mddev->thread); } -static void handle_stripe_dirtying(raid5_conf_t *conf, - struct stripe_head *sh, - struct stripe_head_state *s, - int disks) +static void handle_stripe_dirtying5(raid5_conf_t *conf, + struct stripe_head *sh, struct stripe_head_state *s, int disks) { int rmw = 0, rcw = 0, i; - if (conf->max_degraded == 2) { - /* RAID6 requires 'rcw' in current implementation - * Calculate the real rcw later - for now fake it - * look like rcw is cheaper - */ - rcw = 1; rmw = 2; - } else for (i = disks; i--; ) { + for (i = disks; i--; ) { /* would I have to read this buffer for read_modify_write */ struct r5dev *dev = &sh->dev[i]; if ((dev->towrite || i == sh->pd_idx) && @@ -2612,19 +2591,16 @@ static void handle_stripe_dirtying(raid5_conf_t *conf, } } } - if (rcw <= rmw && rcw > 0) { + if (rcw <= rmw && rcw > 0) /* want reconstruct write, but need to get some data */ - rcw = 0; for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; if (!test_bit(R5_OVERWRITE, &dev->flags) && - i != sh->pd_idx && i != sh->qd_idx && + i != sh->pd_idx && !test_bit(R5_LOCKED, &dev->flags) && !(test_bit(R5_UPTODATE, &dev->flags) || - test_bit(R5_Wantcompute, &dev->flags))) { - rcw++; - if (!test_bit(R5_Insync, &dev->flags)) - continue; /* it's a failed drive */ + test_bit(R5_Wantcompute, &dev->flags)) && + test_bit(R5_Insync, &dev->flags)) { if ( test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { pr_debug("Read_old block " @@ -2638,7 +2614,6 @@ static void handle_stripe_dirtying(raid5_conf_t *conf, } } } - } /* now if nothing is locked, and if we have enough data, * we can start a write request */ @@ -2655,6 +2630,53 @@ static void handle_stripe_dirtying(raid5_conf_t *conf, schedule_reconstruction(sh, s, rcw == 0, 0); } +static void handle_stripe_dirtying6(raid5_conf_t *conf, + struct stripe_head *sh, struct stripe_head_state *s, + struct r6_state *r6s, int disks) +{ + int rcw = 0, pd_idx = sh->pd_idx, i; + int qd_idx = sh->qd_idx; + + set_bit(STRIPE_HANDLE, &sh->state); + for (i = disks; i--; ) { + struct r5dev *dev = &sh->dev[i]; + /* check if we haven't enough data */ + if (!test_bit(R5_OVERWRITE, &dev->flags) && + i != pd_idx && i != qd_idx && + !test_bit(R5_LOCKED, &dev->flags) && + !(test_bit(R5_UPTODATE, &dev->flags) || + test_bit(R5_Wantcompute, &dev->flags))) { + rcw++; + if (!test_bit(R5_Insync, &dev->flags)) + continue; /* it's a failed drive */ + + if ( + test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { + pr_debug("Read_old stripe %llu " + "block %d for Reconstruct\n", + (unsigned long long)sh->sector, i); + 
set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantread, &dev->flags); + s->locked++; + } else { + pr_debug("Request delayed stripe %llu " + "block %d for Reconstruct\n", + (unsigned long long)sh->sector, i); + set_bit(STRIPE_DELAYED, &sh->state); + set_bit(STRIPE_HANDLE, &sh->state); + } + } + } + /* now if nothing is locked, and if we have enough data, we can start a + * write request + */ + if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && + s->locked == 0 && rcw == 0 && + !test_bit(STRIPE_BIT_DELAY, &sh->state)) { + schedule_reconstruction(sh, s, 1, 0); + } +} + static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, struct stripe_head_state *s, int disks) { @@ -2673,7 +2695,7 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, s->uptodate--; break; } - dev = &sh->dev[s->failed_num[0]]; + dev = &sh->dev[s->failed_num]; /* fall through */ case check_state_compute_result: sh->check_state = check_state_idle; @@ -2745,7 +2767,7 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, struct stripe_head_state *s, - int disks) + struct r6_state *r6s, int disks) { int pd_idx = sh->pd_idx; int qd_idx = sh->qd_idx; @@ -2764,14 +2786,14 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, switch (sh->check_state) { case check_state_idle: /* start a new check operation if there are < 2 failures */ - if (s->failed == s->q_failed) { + if (s->failed == r6s->q_failed) { /* The only possible failed device holds Q, so it * makes sense to check P (If anything else were failed, * we would have used P to recreate it). */ sh->check_state = check_state_run; } - if (!s->q_failed && s->failed < 2) { + if (!r6s->q_failed && s->failed < 2) { /* Q is not failed, and we didn't use it to generate * anything, so it makes sense to check it */ @@ -2813,13 +2835,13 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, */ BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */ if (s->failed == 2) { - dev = &sh->dev[s->failed_num[1]]; + dev = &sh->dev[r6s->failed_num[1]]; s->locked++; set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantwrite, &dev->flags); } if (s->failed >= 1) { - dev = &sh->dev[s->failed_num[0]]; + dev = &sh->dev[r6s->failed_num[0]]; s->locked++; set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantwrite, &dev->flags); @@ -2906,7 +2928,8 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, } } -static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh) +static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, + struct r6_state *r6s) { int i; @@ -2948,7 +2971,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh) set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); for (j = 0; j < conf->raid_disks; j++) if (j != sh2->pd_idx && - j != sh2->qd_idx && + (!r6s || j != sh2->qd_idx) && !test_bit(R5_Expanded, &sh2->dev[j].flags)) break; if (j == conf->raid_disks) { @@ -2983,35 +3006,43 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh) * */ -static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) +static void handle_stripe5(struct stripe_head *sh) { raid5_conf_t *conf = sh->raid_conf; - int disks = sh->disks; + int disks = sh->disks, i; + struct bio *return_bi = NULL; + struct stripe_head_state s; struct r5dev *dev; - int i; + mdk_rdev_t 
*blocked_rdev = NULL; + int prexor; + int dec_preread_active = 0; - memset(s, 0, sizeof(*s)); + memset(&s, 0, sizeof(s)); + pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d check:%d " + "reconstruct:%d\n", (unsigned long long)sh->sector, sh->state, + atomic_read(&sh->count), sh->pd_idx, sh->check_state, + sh->reconstruct_state); - s->syncing = test_bit(STRIPE_SYNCING, &sh->state); - s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); - s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); - s->failed_num[0] = -1; - s->failed_num[1] = -1; + spin_lock(&sh->lock); + clear_bit(STRIPE_HANDLE, &sh->state); + clear_bit(STRIPE_DELAYED, &sh->state); + + s.syncing = test_bit(STRIPE_SYNCING, &sh->state); + s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); + s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); /* Now to look around and see what can be done */ rcu_read_lock(); - spin_lock_irq(&conf->device_lock); for (i=disks; i--; ) { mdk_rdev_t *rdev; - sector_t first_bad; - int bad_sectors; - int is_bad = 0; dev = &sh->dev[i]; - pr_debug("check %d: state 0x%lx read %p write %p written %p\n", - i, dev->flags, dev->toread, dev->towrite, dev->written); - /* maybe we can reply to a read + pr_debug("check %d: state 0x%lx toread %p read %p write %p " + "written %p\n", i, dev->flags, dev->toread, dev->read, + dev->towrite, dev->written); + + /* maybe we can request a biofill operation * * new wantfill requests are only permitted while * ops_complete_biofill is guaranteed to be inactive @@ -3021,74 +3052,37 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) set_bit(R5_Wantfill, &dev->flags); /* now count some things */ - if (test_bit(R5_LOCKED, &dev->flags)) - s->locked++; - if (test_bit(R5_UPTODATE, &dev->flags)) - s->uptodate++; - if (test_bit(R5_Wantcompute, &dev->flags)) { - s->compute++; - BUG_ON(s->compute > 2); - } + if (test_bit(R5_LOCKED, &dev->flags)) s.locked++; + if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++; + if (test_bit(R5_Wantcompute, &dev->flags)) s.compute++; if (test_bit(R5_Wantfill, &dev->flags)) - s->to_fill++; + s.to_fill++; else if (dev->toread) - s->to_read++; + s.to_read++; if (dev->towrite) { - s->to_write++; + s.to_write++; if (!test_bit(R5_OVERWRITE, &dev->flags)) - s->non_overwrite++; + s.non_overwrite++; } if (dev->written) - s->written++; + s.written++; rdev = rcu_dereference(conf->disks[i].rdev); - if (rdev) { - is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, - &first_bad, &bad_sectors); - if (s->blocked_rdev == NULL - && (test_bit(Blocked, &rdev->flags) - || is_bad < 0)) { - if (is_bad < 0) - set_bit(BlockedBadBlocks, - &rdev->flags); - s->blocked_rdev = rdev; - atomic_inc(&rdev->nr_pending); - } + if (blocked_rdev == NULL && + rdev && unlikely(test_bit(Blocked, &rdev->flags))) { + blocked_rdev = rdev; + atomic_inc(&rdev->nr_pending); } clear_bit(R5_Insync, &dev->flags); if (!rdev) /* Not in-sync */; - else if (is_bad) { - /* also not in-sync */ - if (!test_bit(WriteErrorSeen, &rdev->flags)) { - /* treat as in-sync, but with a read error - * which we can now try to correct - */ - set_bit(R5_Insync, &dev->flags); - set_bit(R5_ReadError, &dev->flags); - } - } else if (test_bit(In_sync, &rdev->flags)) + else if (test_bit(In_sync, &rdev->flags)) set_bit(R5_Insync, &dev->flags); else { - /* in sync if before recovery_offset */ + /* could be in-sync depending on recovery/reshape status */ if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) set_bit(R5_Insync, &dev->flags); } - if 
(test_bit(R5_WriteError, &dev->flags)) { - clear_bit(R5_Insync, &dev->flags); - if (!test_bit(Faulty, &rdev->flags)) { - s->handle_bad_blocks = 1; - atomic_inc(&rdev->nr_pending); - } else - clear_bit(R5_WriteError, &dev->flags); - } - if (test_bit(R5_MadeGood, &dev->flags)) { - if (!test_bit(Faulty, &rdev->flags)) { - s->handle_bad_blocks = 1; - atomic_inc(&rdev->nr_pending); - } else - clear_bit(R5_MadeGood, &dev->flags); - } if (!test_bit(R5_Insync, &dev->flags)) { /* The ReadError flag will just be confusing now */ clear_bit(R5_ReadError, &dev->flags); @@ -3097,60 +3091,313 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) if (test_bit(R5_ReadError, &dev->flags)) clear_bit(R5_Insync, &dev->flags); if (!test_bit(R5_Insync, &dev->flags)) { - if (s->failed < 2) - s->failed_num[s->failed] = i; - s->failed++; + s.failed++; + s.failed_num = i; } } - spin_unlock_irq(&conf->device_lock); rcu_read_unlock(); -} -static void handle_stripe(struct stripe_head *sh) -{ - struct stripe_head_state s; - raid5_conf_t *conf = sh->raid_conf; - int i; - int prexor; - int disks = sh->disks; - struct r5dev *pdev, *qdev; + if (unlikely(blocked_rdev)) { + if (s.syncing || s.expanding || s.expanded || + s.to_write || s.written) { + set_bit(STRIPE_HANDLE, &sh->state); + goto unlock; + } + /* There is nothing for the blocked_rdev to block */ + rdev_dec_pending(blocked_rdev, conf->mddev); + blocked_rdev = NULL; + } - clear_bit(STRIPE_HANDLE, &sh->state); - if (test_and_set_bit(STRIPE_ACTIVE, &sh->state)) { - /* already being handled, ensure it gets handled - * again when current action finishes */ - set_bit(STRIPE_HANDLE, &sh->state); - return; + if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { + set_bit(STRIPE_OP_BIOFILL, &s.ops_request); + set_bit(STRIPE_BIOFILL_RUN, &sh->state); } - if (test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { - set_bit(STRIPE_SYNCING, &sh->state); - clear_bit(STRIPE_INSYNC, &sh->state); + pr_debug("locked=%d uptodate=%d to_read=%d" + " to_write=%d failed=%d failed_num=%d\n", + s.locked, s.uptodate, s.to_read, s.to_write, + s.failed, s.failed_num); + /* check if the array has lost two devices and, if so, some requests might + * need to be failed + */ + if (s.failed > 1 && s.to_read+s.to_write+s.written) + handle_failed_stripe(conf, sh, &s, disks, &return_bi); + if (s.failed > 1 && s.syncing) { + md_done_sync(conf->mddev, STRIPE_SECTORS,0); + clear_bit(STRIPE_SYNCING, &sh->state); + s.syncing = 0; } - clear_bit(STRIPE_DELAYED, &sh->state); + + /* might be able to return some write requests if the parity block + * is safe, or on a failed drive + */ + dev = &sh->dev[sh->pd_idx]; + if ( s.written && + ((test_bit(R5_Insync, &dev->flags) && + !test_bit(R5_LOCKED, &dev->flags) && + test_bit(R5_UPTODATE, &dev->flags)) || + (s.failed == 1 && s.failed_num == sh->pd_idx))) + handle_stripe_clean_event(conf, sh, disks, &return_bi); + + /* Now we might consider reading some blocks, either to check/generate + * parity, or to satisfy requests + * or to load a block that is being partially written. 
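+ *
+ * (e.g. a 4K write into a much larger stripe counts as
+ *  non_overwrite: the read-modify-write path must first read the
+ *  old data and parity before the new parity can be computed.
+ *  The 4K figure is only an example.)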
+ */ + if (s.to_read || s.non_overwrite || + (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding) + handle_stripe_fill5(sh, &s, disks); + + /* Now we check to see if any write operations have recently + * completed + */ + prexor = 0; + if (sh->reconstruct_state == reconstruct_state_prexor_drain_result) + prexor = 1; + if (sh->reconstruct_state == reconstruct_state_drain_result || + sh->reconstruct_state == reconstruct_state_prexor_drain_result) { + sh->reconstruct_state = reconstruct_state_idle; + + /* All the 'written' buffers and the parity block are ready to + * be written back to disk + */ + BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags)); + for (i = disks; i--; ) { + dev = &sh->dev[i]; + if (test_bit(R5_LOCKED, &dev->flags) && + (i == sh->pd_idx || dev->written)) { + pr_debug("Writing block %d\n", i); + set_bit(R5_Wantwrite, &dev->flags); + if (prexor) + continue; + if (!test_bit(R5_Insync, &dev->flags) || + (i == sh->pd_idx && s.failed == 0)) + set_bit(STRIPE_INSYNC, &sh->state); + } + } + if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + dec_preread_active = 1; + } + + /* Now to consider new write requests and what else, if anything + * should be read. We do not handle new writes when: + * 1/ A 'write' operation (copy+xor) is already in flight. + * 2/ A 'check' operation is in flight, as it may clobber the parity + * block. + */ + if (s.to_write && !sh->reconstruct_state && !sh->check_state) + handle_stripe_dirtying5(conf, sh, &s, disks); + + /* maybe we need to check and possibly fix the parity for this stripe + * Any reads will already have been scheduled, so we just see if enough + * data is available. The parity check is held off while parity + * dependent operations are in flight. + */ + if (sh->check_state || + (s.syncing && s.locked == 0 && + !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && + !test_bit(STRIPE_INSYNC, &sh->state))) + handle_parity_checks5(conf, sh, &s, disks); + + if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { + md_done_sync(conf->mddev, STRIPE_SECTORS,1); + clear_bit(STRIPE_SYNCING, &sh->state); + } + + /* If the failed drive is just a ReadError, then we might need to progress + * the repair/check process + */ + if (s.failed == 1 && !conf->mddev->ro && + test_bit(R5_ReadError, &sh->dev[s.failed_num].flags) + && !test_bit(R5_LOCKED, &sh->dev[s.failed_num].flags) + && test_bit(R5_UPTODATE, &sh->dev[s.failed_num].flags) + ) { + dev = &sh->dev[s.failed_num]; + if (!test_bit(R5_ReWrite, &dev->flags)) { + set_bit(R5_Wantwrite, &dev->flags); + set_bit(R5_ReWrite, &dev->flags); + set_bit(R5_LOCKED, &dev->flags); + s.locked++; + } else { + /* let's read it back */ + set_bit(R5_Wantread, &dev->flags); + set_bit(R5_LOCKED, &dev->flags); + s.locked++; + } + } + + /* Finish reconstruct operations initiated by the expansion process */ + if (sh->reconstruct_state == reconstruct_state_result) { + struct stripe_head *sh2 + = get_active_stripe(conf, sh->sector, 1, 1, 1); + if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) { + /* sh cannot be written until sh2 has been read. 
+ * so arrange for sh to be delayed a little + */ + set_bit(STRIPE_DELAYED, &sh->state); + set_bit(STRIPE_HANDLE, &sh->state); + if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, + &sh2->state)) + atomic_inc(&conf->preread_active_stripes); + release_stripe(sh2); + goto unlock; + } + if (sh2) + release_stripe(sh2); + + sh->reconstruct_state = reconstruct_state_idle; + clear_bit(STRIPE_EXPANDING, &sh->state); + for (i = conf->raid_disks; i--; ) { + set_bit(R5_Wantwrite, &sh->dev[i].flags); + set_bit(R5_LOCKED, &sh->dev[i].flags); + s.locked++; + } + } + + if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && + !sh->reconstruct_state) { + /* Need to write out all blocks after computing parity */ + sh->disks = conf->raid_disks; + stripe_set_idx(sh->sector, conf, 0, sh); + schedule_reconstruction(sh, &s, 1, 1); + } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { + clear_bit(STRIPE_EXPAND_READY, &sh->state); + atomic_dec(&conf->reshape_stripes); + wake_up(&conf->wait_for_overlap); + md_done_sync(conf->mddev, STRIPE_SECTORS, 1); + } + + if (s.expanding && s.locked == 0 && + !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) + handle_stripe_expansion(conf, sh, NULL); + + unlock: + spin_unlock(&sh->lock); + + /* wait for this device to become unblocked */ + if (unlikely(blocked_rdev)) + md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); + + if (s.ops_request) + raid_run_ops(sh, s.ops_request); + + ops_run_io(sh, &s); + + if (dec_preread_active) { + /* We delay this until after ops_run_io so that if make_request + * is waiting on a flush, it won't continue until the writes + * have actually been submitted. + */ + atomic_dec(&conf->preread_active_stripes); + if (atomic_read(&conf->preread_active_stripes) < + IO_THRESHOLD) + md_wakeup_thread(conf->mddev->thread); + } + return_io(return_bi); +} + +static void handle_stripe6(struct stripe_head *sh) +{ + raid5_conf_t *conf = sh->raid_conf; + int disks = sh->disks; + struct bio *return_bi = NULL; + int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx; + struct stripe_head_state s; + struct r6_state r6s; + struct r5dev *dev, *pdev, *qdev; + mdk_rdev_t *blocked_rdev = NULL; + int dec_preread_active = 0; pr_debug("handling stripe %llu, state=%#lx cnt=%d, " "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n", (unsigned long long)sh->sector, sh->state, - atomic_read(&sh->count), sh->pd_idx, sh->qd_idx, + atomic_read(&sh->count), pd_idx, qd_idx, sh->check_state, sh->reconstruct_state); + memset(&s, 0, sizeof(s)); - analyse_stripe(sh, &s); + spin_lock(&sh->lock); + clear_bit(STRIPE_HANDLE, &sh->state); + clear_bit(STRIPE_DELAYED, &sh->state); - if (s.handle_bad_blocks) { - set_bit(STRIPE_HANDLE, &sh->state); - goto finish; + s.syncing = test_bit(STRIPE_SYNCING, &sh->state); + s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); + s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); + /* Now to look around and see what can be done */ + + rcu_read_lock(); + for (i=disks; i--; ) { + mdk_rdev_t *rdev; + dev = &sh->dev[i]; + + pr_debug("check %d: state 0x%lx read %p write %p written %p\n", + i, dev->flags, dev->toread, dev->towrite, dev->written); + /* maybe we can reply to a read + * + * new wantfill requests are only permitted while + * ops_complete_biofill is guaranteed to be inactive + */ + if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && + !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) + set_bit(R5_Wantfill, &dev->flags); + + /* now count some things */ + if (test_bit(R5_LOCKED, &dev->flags)) s.locked++; + if (test_bit(R5_UPTODATE, 
&dev->flags)) s.uptodate++; + if (test_bit(R5_Wantcompute, &dev->flags)) { + s.compute++; + BUG_ON(s.compute > 2); + } + + if (test_bit(R5_Wantfill, &dev->flags)) { + s.to_fill++; + } else if (dev->toread) + s.to_read++; + if (dev->towrite) { + s.to_write++; + if (!test_bit(R5_OVERWRITE, &dev->flags)) + s.non_overwrite++; + } + if (dev->written) + s.written++; + rdev = rcu_dereference(conf->disks[i].rdev); + if (blocked_rdev == NULL && + rdev && unlikely(test_bit(Blocked, &rdev->flags))) { + blocked_rdev = rdev; + atomic_inc(&rdev->nr_pending); + } + clear_bit(R5_Insync, &dev->flags); + if (!rdev) + /* Not in-sync */; + else if (test_bit(In_sync, &rdev->flags)) + set_bit(R5_Insync, &dev->flags); + else { + /* in sync if before recovery_offset */ + if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) + set_bit(R5_Insync, &dev->flags); + } + if (!test_bit(R5_Insync, &dev->flags)) { + /* The ReadError flag will just be confusing now */ + clear_bit(R5_ReadError, &dev->flags); + clear_bit(R5_ReWrite, &dev->flags); + } + if (test_bit(R5_ReadError, &dev->flags)) + clear_bit(R5_Insync, &dev->flags); + if (!test_bit(R5_Insync, &dev->flags)) { + if (s.failed < 2) + r6s.failed_num[s.failed] = i; + s.failed++; + } } + rcu_read_unlock(); - if (unlikely(s.blocked_rdev)) { + if (unlikely(blocked_rdev)) { if (s.syncing || s.expanding || s.expanded || s.to_write || s.written) { set_bit(STRIPE_HANDLE, &sh->state); - goto finish; + goto unlock; } /* There is nothing for the blocked_rdev to block */ - rdev_dec_pending(s.blocked_rdev, conf->mddev); - s.blocked_rdev = NULL; + rdev_dec_pending(blocked_rdev, conf->mddev); + blocked_rdev = NULL; } if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { @@ -3161,88 +3408,83 @@ static void handle_stripe(struct stripe_head *sh) pr_debug("locked=%d uptodate=%d to_read=%d" " to_write=%d failed=%d failed_num=%d,%d\n", s.locked, s.uptodate, s.to_read, s.to_write, s.failed, - s.failed_num[0], s.failed_num[1]); - /* check if the array has lost more than max_degraded devices and, - * if so, some requests might need to be failed. 
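
The RAID6 path differs mainly in how many failures it records: up to two failed slot indices are kept in r6_state.failed_num[], guarded by the s.failed < 2 test above. A small sketch of that bookkeeping with plain C types:

#include <stdio.h>

struct r6_state_sketch {
	int failed_num[2];  /* slots of up to two failed members */
};

/* Record a failed member, keeping at most two indices (RAID6's limit). */
static void note_failure(struct r6_state_sketch *r6, int *failed, int slot)
{
	if (*failed < 2)
		r6->failed_num[*failed] = slot;
	(*failed)++;	/* counted even when it can no longer be repaired */
}

int main(void)
{
	struct r6_state_sketch r6;
	int failed = 0;

	note_failure(&r6, &failed, 3);
	note_failure(&r6, &failed, 5);
	printf("failed=%d first=%d second=%d\n",
	       failed, r6.failed_num[0], r6.failed_num[1]);
	return 0;
}
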
+ r6s.failed_num[0], r6s.failed_num[1]); + /* check if the array has lost >2 devices and, if so, some requests + * might need to be failed */ - if (s.failed > conf->max_degraded && s.to_read+s.to_write+s.written) - handle_failed_stripe(conf, sh, &s, disks, &s.return_bi); - if (s.failed > conf->max_degraded && s.syncing) - handle_failed_sync(conf, sh, &s); + if (s.failed > 2 && s.to_read+s.to_write+s.written) + handle_failed_stripe(conf, sh, &s, disks, &return_bi); + if (s.failed > 2 && s.syncing) { + md_done_sync(conf->mddev, STRIPE_SECTORS,0); + clear_bit(STRIPE_SYNCING, &sh->state); + s.syncing = 0; + } /* * might be able to return some write requests if the parity blocks * are safe, or on a failed drive */ - pdev = &sh->dev[sh->pd_idx]; - s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx) - || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx); - qdev = &sh->dev[sh->qd_idx]; - s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx) - || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx) - || conf->level < 6; - - if (s.written && - (s.p_failed || ((test_bit(R5_Insync, &pdev->flags) + pdev = &sh->dev[pd_idx]; + r6s.p_failed = (s.failed >= 1 && r6s.failed_num[0] == pd_idx) + || (s.failed >= 2 && r6s.failed_num[1] == pd_idx); + qdev = &sh->dev[qd_idx]; + r6s.q_failed = (s.failed >= 1 && r6s.failed_num[0] == qd_idx) + || (s.failed >= 2 && r6s.failed_num[1] == qd_idx); + + if ( s.written && + ( r6s.p_failed || ((test_bit(R5_Insync, &pdev->flags) && !test_bit(R5_LOCKED, &pdev->flags) && test_bit(R5_UPTODATE, &pdev->flags)))) && - (s.q_failed || ((test_bit(R5_Insync, &qdev->flags) + ( r6s.q_failed || ((test_bit(R5_Insync, &qdev->flags) && !test_bit(R5_LOCKED, &qdev->flags) && test_bit(R5_UPTODATE, &qdev->flags))))) - handle_stripe_clean_event(conf, sh, disks, &s.return_bi); + handle_stripe_clean_event(conf, sh, disks, &return_bi); /* Now we might consider reading some blocks, either to check/generate * parity, or to satisfy requests * or to load a block that is being partially written. 
*/ - if (s.to_read || s.non_overwrite - || (conf->level == 6 && s.to_write && s.failed) - || (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding) - handle_stripe_fill(sh, &s, disks); + if (s.to_read || s.non_overwrite || (s.to_write && s.failed) || + (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding) + handle_stripe_fill6(sh, &s, &r6s, disks); /* Now we check to see if any write operations have recently * completed */ - prexor = 0; - if (sh->reconstruct_state == reconstruct_state_prexor_drain_result) - prexor = 1; - if (sh->reconstruct_state == reconstruct_state_drain_result || - sh->reconstruct_state == reconstruct_state_prexor_drain_result) { - sh->reconstruct_state = reconstruct_state_idle; + if (sh->reconstruct_state == reconstruct_state_drain_result) { - /* All the 'written' buffers and the parity block are ready to + sh->reconstruct_state = reconstruct_state_idle; + /* All the 'written' buffers and the parity blocks are ready to * be written back to disk */ BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags)); - BUG_ON(sh->qd_idx >= 0 && - !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags)); + BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags)); for (i = disks; i--; ) { - struct r5dev *dev = &sh->dev[i]; + dev = &sh->dev[i]; if (test_bit(R5_LOCKED, &dev->flags) && - (i == sh->pd_idx || i == sh->qd_idx || - dev->written)) { + (i == sh->pd_idx || i == qd_idx || + dev->written)) { pr_debug("Writing block %d\n", i); + BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); set_bit(R5_Wantwrite, &dev->flags); - if (prexor) - continue; if (!test_bit(R5_Insync, &dev->flags) || - ((i == sh->pd_idx || i == sh->qd_idx) && - s.failed == 0)) + ((i == sh->pd_idx || i == qd_idx) && + s.failed == 0)) set_bit(STRIPE_INSYNC, &sh->state); } } if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) - s.dec_preread_active = 1; + dec_preread_active = 1; } /* Now to consider new write requests and what else, if anything * should be read. We do not handle new writes when: - * 1/ A 'write' operation (copy+xor) is already in flight. + * 1/ A 'write' operation (copy+gen_syndrome) is already in flight. * 2/ A 'check' operation is in flight, as it may clobber the parity * block. 
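
The comment above amounts to a simple gate: new writes are only scheduled while neither a reconstruct nor a check operation is in flight, because either may still read or rewrite the parity block. A hedged sketch of the gate in plain C (the enums are stand-ins for the stripe_head fields):

#include <stdbool.h>
#include <stdio.h>

/* Illustrative only: these enums stand in for the reconstruct_state
 * and check_state fields of struct stripe_head. */
enum recon { recon_idle, recon_running };
enum check { check_idle, check_running };

/* mirrors: s.to_write && !sh->reconstruct_state && !sh->check_state */
static bool may_start_new_write(int to_write, enum recon r, enum check c)
{
	return to_write != 0 && r == recon_idle && c == check_idle;
}

int main(void)
{
	printf("%d\n", may_start_new_write(1, recon_idle, check_running));
	return 0;
}
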
*/ if (s.to_write && !sh->reconstruct_state && !sh->check_state) - handle_stripe_dirtying(conf, sh, &s, disks); + handle_stripe_dirtying6(conf, sh, &s, &r6s, disks); /* maybe we need to check and possibly fix the parity for this stripe * Any reads will already have been scheduled, so we just see if enough @@ -3252,24 +3494,20 @@ static void handle_stripe(struct stripe_head *sh) if (sh->check_state || (s.syncing && s.locked == 0 && !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && - !test_bit(STRIPE_INSYNC, &sh->state))) { - if (conf->level == 6) - handle_parity_checks6(conf, sh, &s, disks); - else - handle_parity_checks5(conf, sh, &s, disks); - } + !test_bit(STRIPE_INSYNC, &sh->state))) + handle_parity_checks6(conf, sh, &s, &r6s, disks); if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { - md_done_sync(conf->mddev, STRIPE_SECTORS, 1); + md_done_sync(conf->mddev, STRIPE_SECTORS,1); clear_bit(STRIPE_SYNCING, &sh->state); } /* If the failed drives are just a ReadError, then we might need * to progress the repair/check process */ - if (s.failed <= conf->max_degraded && !conf->mddev->ro) + if (s.failed <= 2 && !conf->mddev->ro) for (i = 0; i < s.failed; i++) { - struct r5dev *dev = &sh->dev[s.failed_num[i]]; + dev = &sh->dev[r6s.failed_num[i]]; if (test_bit(R5_ReadError, &dev->flags) && !test_bit(R5_LOCKED, &dev->flags) && test_bit(R5_UPTODATE, &dev->flags) @@ -3288,26 +3526,8 @@ static void handle_stripe(struct stripe_head *sh) } } - /* Finish reconstruct operations initiated by the expansion process */ if (sh->reconstruct_state == reconstruct_state_result) { - struct stripe_head *sh_src - = get_active_stripe(conf, sh->sector, 1, 1, 1); - if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) { - /* sh cannot be written until sh_src has been read. - * so arrange for sh to be delayed a little - */ - set_bit(STRIPE_DELAYED, &sh->state); - set_bit(STRIPE_HANDLE, &sh->state); - if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, - &sh_src->state)) - atomic_inc(&conf->preread_active_stripes); - release_stripe(sh_src); - goto finish; - } - if (sh_src) - release_stripe(sh_src); - sh->reconstruct_state = reconstruct_state_idle; clear_bit(STRIPE_EXPANDING, &sh->state); for (i = conf->raid_disks; i--; ) { @@ -3319,7 +3539,24 @@ static void handle_stripe(struct stripe_head *sh) if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && !sh->reconstruct_state) { - /* Need to write out all blocks after computing parity */ + struct stripe_head *sh2 + = get_active_stripe(conf, sh->sector, 1, 1, 1); + if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) { + /* sh cannot be written until sh2 has been read. 
+ * so arrange for sh to be delayed a little + */ + set_bit(STRIPE_DELAYED, &sh->state); + set_bit(STRIPE_HANDLE, &sh->state); + if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, + &sh2->state)) + atomic_inc(&conf->preread_active_stripes); + release_stripe(sh2); + goto unlock; + } + if (sh2) + release_stripe(sh2); + + /* Need to write out all blocks after computing P&Q */ sh->disks = conf->raid_disks; stripe_set_idx(sh->sector, conf, 0, sh); schedule_reconstruction(sh, &s, 1, 1); @@ -3332,39 +3569,22 @@ static void handle_stripe(struct stripe_head *sh) if (s.expanding && s.locked == 0 && !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) - handle_stripe_expansion(conf, sh); + handle_stripe_expansion(conf, sh, &r6s); -finish: - /* wait for this device to become unblocked */ - if (unlikely(s.blocked_rdev)) - md_wait_for_blocked_rdev(s.blocked_rdev, conf->mddev); + unlock: + spin_unlock(&sh->lock); - if (s.handle_bad_blocks) - for (i = disks; i--; ) { - mdk_rdev_t *rdev; - struct r5dev *dev = &sh->dev[i]; - if (test_and_clear_bit(R5_WriteError, &dev->flags)) { - /* We own a safe reference to the rdev */ - rdev = conf->disks[i].rdev; - if (!rdev_set_badblocks(rdev, sh->sector, - STRIPE_SECTORS, 0)) - md_error(conf->mddev, rdev); - rdev_dec_pending(rdev, conf->mddev); - } - if (test_and_clear_bit(R5_MadeGood, &dev->flags)) { - rdev = conf->disks[i].rdev; - rdev_clear_badblocks(rdev, sh->sector, - STRIPE_SECTORS); - rdev_dec_pending(rdev, conf->mddev); - } - } + /* wait for this device to become unblocked */ + if (unlikely(blocked_rdev)) + md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); if (s.ops_request) raid_run_ops(sh, s.ops_request); ops_run_io(sh, &s); - if (s.dec_preread_active) { + + if (dec_preread_active) { /* We delay this until after ops_run_io so that if make_request * is waiting on a flush, it won't continue until the writes * have actually been submitted. 
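
The dec_preread_active handling above postpones the counter decrement until after ops_run_io(), so a flush waiting in make_request() cannot resume before its writes are really issued. The wake-below-threshold pattern itself is easy to isolate; a sketch with an ordinary counter (IO_THRESHOLD and the wakeup are stand-ins for the conf fields):

#include <stdio.h>

#define IO_THRESHOLD 1	/* assumed small value for the sketch */

static int preread_active;	/* stands in for conf->preread_active_stripes */

static void wake_raid_thread(void) { printf("md thread woken\n"); }

/* Decrement after I/O submission; wake the daemon once we drop
 * below the threshold, mirroring the hunk above. */
static void dec_preread(void)
{
	if (--preread_active < IO_THRESHOLD)
		wake_raid_thread();
}

int main(void)
{
	preread_active = 2;
	dec_preread();	/* 1: not below threshold */
	dec_preread();	/* 0: wakes the thread */
	return 0;
}
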
@@ -3375,9 +3595,15 @@ static void handle_stripe(struct stripe_head *sh) md_wakeup_thread(conf->mddev->thread); } - return_io(s.return_bi); + return_io(return_bi); +} - clear_bit(STRIPE_ACTIVE, &sh->state); +static void handle_stripe(struct stripe_head *sh) +{ + if (sh->raid_conf->level == 6) + handle_stripe6(sh); + else + handle_stripe5(sh); } static void raid5_activate_delayed(raid5_conf_t *conf) @@ -3607,9 +3833,6 @@ static int chunk_aligned_read(mddev_t *mddev, struct bio * raid_bio) rcu_read_lock(); rdev = rcu_dereference(conf->disks[dd_idx].rdev); if (rdev && test_bit(In_sync, &rdev->flags)) { - sector_t first_bad; - int bad_sectors; - atomic_inc(&rdev->nr_pending); rcu_read_unlock(); raid_bio->bi_next = (void*)rdev; @@ -3617,10 +3840,8 @@ static int chunk_aligned_read(mddev_t *mddev, struct bio * raid_bio) align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); align_bi->bi_sector += rdev->data_offset; - if (!bio_fits_rdev(align_bi) || - is_badblock(rdev, align_bi->bi_sector, align_bi->bi_size>>9, - &first_bad, &bad_sectors)) { - /* too big in some way, or has a known bad block */ + if (!bio_fits_rdev(align_bi)) { + /* too big in some way */ bio_put(align_bi); rdev_dec_pending(rdev, mddev); return 0; @@ -3795,7 +4016,7 @@ static int make_request(mddev_t *mddev, struct bio * bi) } } - if (rw == WRITE && + if (bio_data_dir(bi) == WRITE && logical_sector >= mddev->suspend_lo && logical_sector < mddev->suspend_hi) { release_stripe(sh); @@ -3813,7 +4034,7 @@ static int make_request(mddev_t *mddev, struct bio * bi) } if (test_bit(STRIPE_EXPANDING, &sh->state) || - !add_stripe_bio(sh, bi, dd_idx, rw)) { + !add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) { /* Stripe is busy expanding or * add failed due to overlap. Flush everything * and wait a while @@ -4154,7 +4375,10 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded); - set_bit(STRIPE_SYNC_REQUESTED, &sh->state); + spin_lock(&sh->lock); + set_bit(STRIPE_SYNCING, &sh->state); + clear_bit(STRIPE_INSYNC, &sh->state); + spin_unlock(&sh->lock); handle_stripe(sh); release_stripe(sh); @@ -4285,9 +4509,6 @@ static void raid5d(mddev_t *mddev) release_stripe(sh); cond_resched(); - if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) - md_check_recovery(mddev); - spin_lock_irq(&conf->device_lock); } pr_debug("%d stripes handled\n", handled); @@ -5092,7 +5313,6 @@ static int raid5_remove_disk(mddev_t *mddev, int number) * isn't possible. 
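
Context for the raid5_remove_disk() hunk that continues below: with the recovery_disabled test reverted, a slot may be vacated only when the device is Faulty, the whole array has already failed, or the slot lies beyond the active disks. A simplified sketch of the remaining eligibility test:

#include <stdbool.h>
#include <stdio.h>

/* Simplified mirror of the check left in raid5_remove_disk() after this
 * revert: the slot is busy unless the device is Faulty, the array as a
 * whole has failed, or the slot lies beyond the active disks. */
static bool remove_is_busy(bool faulty, bool array_failed,
			   int number, int raid_disks)
{
	return !faulty && !array_failed && number < raid_disks;
}

int main(void)
{
	printf("EBUSY: %d\n", remove_is_busy(false, false, 0, 4));
	return 0;
}
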
*/ if (!test_bit(Faulty, &rdev->flags) && - mddev->recovery_disabled != conf->recovery_disabled && !has_failed(conf) && number < conf->raid_disks) { err = -EBUSY; @@ -5121,9 +5341,6 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) int first = 0; int last = conf->raid_disks - 1; - if (mddev->recovery_disabled == conf->recovery_disabled) - return -EBUSY; - if (has_failed(conf)) /* no point adding a device */ return -EINVAL; @@ -5302,14 +5519,16 @@ static int raid5_start_reshape(mddev_t *mddev) if (rdev->raid_disk < 0 && !test_bit(Faulty, &rdev->flags)) { if (raid5_add_disk(mddev, rdev) == 0) { + char nm[20]; if (rdev->raid_disk >= conf->previous_raid_disks) { set_bit(In_sync, &rdev->flags); added_devices++; } else rdev->recovery_offset = 0; - - if (sysfs_link_rdev(mddev, rdev)) + sprintf(nm, "rd%d", rdev->raid_disk); + if (sysfs_create_link(&mddev->kobj, + &rdev->kobj, nm)) /* Failure here is OK */; } } else if (rdev->raid_disk >= conf->previous_raid_disks @@ -5405,7 +5624,9 @@ static void raid5_finish_reshape(mddev_t *mddev) d++) { mdk_rdev_t *rdev = conf->disks[d].rdev; if (rdev && raid5_remove_disk(mddev, d) == 0) { - sysfs_unlink_rdev(mddev, rdev); + char nm[20]; + sprintf(nm, "rd%d", rdev->raid_disk); + sysfs_remove_link(&mddev->kobj, nm); rdev->raid_disk = -1; } } diff --git a/trunk/drivers/md/raid5.h b/trunk/drivers/md/raid5.h index 11b9566184b2..3ca77a2613ba 100644 --- a/trunk/drivers/md/raid5.h +++ b/trunk/drivers/md/raid5.h @@ -6,11 +6,11 @@ /* * - * Each stripe contains one buffer per device. Each buffer can be in + * Each stripe contains one buffer per disc. Each buffer can be in * one of a number of states stored in "flags". Changes between - * these states happen *almost* exclusively under the protection of the - * STRIPE_ACTIVE flag. Some very specific changes can happen in bi_end_io, and - * these are not protected by STRIPE_ACTIVE. + * these states happen *almost* exclusively under a per-stripe + * spinlock. Some very specific changes can happen in bi_end_io, and + * these are not protected by the spin lock. * * The flag bits that are used to represent these states are: * R5_UPTODATE and R5_LOCKED @@ -76,10 +76,12 @@ * block and the cached buffer are successfully written, any buffer on * a written list can be returned with b_end_io. * - * The write list and read list both act as fifos. The read list, - * write list and written list are protected by the device_lock. - * The device_lock is only for list manipulations and will only be - * held for a very short time. It can be claimed from interrupts. + * The write list and read list both act as fifos. The read list is + * protected by the device_lock. The write and written lists are + * protected by the stripe lock. The device_lock, which can be + * claimed while the stipe lock is held, is only for list + * manipulations and will only be held for a very short time. It can + * be claimed from interrupts. * * * Stripes in the stripe cache can be on one of two lists (or on @@ -94,6 +96,7 @@ * * The inactive_list, handle_list and hash bucket lists are all protected by the * device_lock. + * - stripes on the inactive_list never have their stripe_lock held. * - stripes have a reference counter. If count==0, they are on a list. * - If a stripe might need handling, STRIPE_HANDLE is set. 
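
The restored locking comment above fixes the ordering rule: device_lock may be taken while the per-stripe lock is held, never the other way around, and only briefly for list manipulation. A userspace sketch of the same discipline, with pthread mutexes standing in for the two kernel spinlocks:

#include <pthread.h>

static pthread_mutex_t stripe_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t device_lock = PTHREAD_MUTEX_INITIALIZER;

/* Legal nesting per the comment above: stripe lock outside,
 * device_lock inside, and only for brief list manipulation. */
static void handle_one_stripe(void)
{
	pthread_mutex_lock(&stripe_lock);
	/* ... examine buffer flags ... */
	pthread_mutex_lock(&device_lock);
	/* ... splice a bio on or off a list ... */
	pthread_mutex_unlock(&device_lock);
	pthread_mutex_unlock(&stripe_lock);
}

int main(void) { handle_one_stripe(); return 0; }
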
* - When refcount reaches zero, then if STRIPE_HANDLE it is put on @@ -113,10 +116,10 @@ * attach a request to an active stripe (add_stripe_bh()) * lockdev attach-buffer unlockdev * handle a stripe (handle_stripe()) - * setSTRIPE_ACTIVE, clrSTRIPE_HANDLE ... + * lockstripe clrSTRIPE_HANDLE ... * (lockdev check-buffers unlockdev) .. * change-state .. - * record io/ops needed clearSTRIPE_ACTIVE schedule io/ops + * record io/ops needed unlockstripe schedule io/ops * release an active stripe (release_stripe()) * lockdev if (!--cnt) { if STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev * @@ -125,7 +128,8 @@ * on a cached buffer, and plus one if the stripe is undergoing stripe * operations. * - * The stripe operations are: + * Stripe operations are performed outside the stripe lock, + * the stripe operations are: * -copying data between the stripe cache and user application buffers * -computing blocks to save a disk access, or to recover a missing block * -updating the parity on a write operation (reconstruct write and @@ -155,8 +159,7 @@ */ /* - * Operations state - intermediate states that are visible outside of - * STRIPE_ACTIVE. + * Operations state - intermediate states that are visible outside of sh->lock * In general _idle indicates nothing is running, _run indicates a data * processing operation is active, and _result means the data processing result * is stable and can be acted upon. For simple operations like biofill and @@ -206,6 +209,7 @@ struct stripe_head { short ddf_layout;/* use DDF ordering to calculate Q */ unsigned long state; /* state flags */ atomic_t count; /* nr of active thread/requests */ + spinlock_t lock; int bm_seq; /* sequence number for bitmap flushes */ int disks; /* disks in stripe */ enum check_states check_state; @@ -236,20 +240,19 @@ struct stripe_head { }; /* stripe_head_state - collects and tracks the dynamic state of a stripe_head - * for handle_stripe. + * for handle_stripe. 
It is only valid under spin_lock(sh->lock); */ struct stripe_head_state { int syncing, expanding, expanded; int locked, uptodate, to_read, to_write, failed, written; int to_fill, compute, req_compute, non_overwrite; - int failed_num[2]; - int p_failed, q_failed; - int dec_preread_active; + int failed_num; unsigned long ops_request; +}; - struct bio *return_bi; - mdk_rdev_t *blocked_rdev; - int handle_bad_blocks; +/* r6_state - extra state data only relevant to r6 */ +struct r6_state { + int p_failed, q_failed, failed_num[2]; }; /* Flags */ @@ -265,16 +268,14 @@ struct stripe_head_state { #define R5_ReWrite 9 /* have tried to over-write the readerror */ #define R5_Expanded 10 /* This block now has post-expand data */ -#define R5_Wantcompute 11 /* compute_block in progress treat as - * uptodate - */ -#define R5_Wantfill 12 /* dev->toread contains a bio that needs - * filling - */ -#define R5_Wantdrain 13 /* dev->towrite needs to be drained */ -#define R5_WantFUA 14 /* Write should be FUA */ -#define R5_WriteError 15 /* got a write error - need to record it */ -#define R5_MadeGood 16 /* A bad block has been fixed by writing to it*/ +#define R5_Wantcompute 11 /* compute_block in progress treat as + * uptodate + */ +#define R5_Wantfill 12 /* dev->toread contains a bio that needs + * filling + */ +#define R5_Wantdrain 13 /* dev->towrite needs to be drained */ +#define R5_WantFUA 14 /* Write should be FUA */ /* * Write method */ @@ -288,25 +289,21 @@ struct stripe_head_state { /* * Stripe state */ -enum { - STRIPE_ACTIVE, - STRIPE_HANDLE, - STRIPE_SYNC_REQUESTED, - STRIPE_SYNCING, - STRIPE_INSYNC, - STRIPE_PREREAD_ACTIVE, - STRIPE_DELAYED, - STRIPE_DEGRADED, - STRIPE_BIT_DELAY, - STRIPE_EXPANDING, - STRIPE_EXPAND_SOURCE, - STRIPE_EXPAND_READY, - STRIPE_IO_STARTED, /* do not count towards 'bypass_count' */ - STRIPE_FULL_WRITE, /* all blocks are set to be overwritten */ - STRIPE_BIOFILL_RUN, - STRIPE_COMPUTE_RUN, - STRIPE_OPS_REQ_PENDING, -}; +#define STRIPE_HANDLE 2 +#define STRIPE_SYNCING 3 +#define STRIPE_INSYNC 4 +#define STRIPE_PREREAD_ACTIVE 5 +#define STRIPE_DELAYED 6 +#define STRIPE_DEGRADED 7 +#define STRIPE_BIT_DELAY 8 +#define STRIPE_EXPANDING 9 +#define STRIPE_EXPAND_SOURCE 10 +#define STRIPE_EXPAND_READY 11 +#define STRIPE_IO_STARTED 12 /* do not count towards 'bypass_count' */ +#define STRIPE_FULL_WRITE 13 /* all blocks are set to be overwritten */ +#define STRIPE_BIOFILL_RUN 14 +#define STRIPE_COMPUTE_RUN 15 +#define STRIPE_OPS_REQ_PENDING 16 /* * Operation request flags @@ -339,7 +336,7 @@ enum { * PREREAD_ACTIVE. * In stripe_handle, if we find pre-reading is necessary, we do it if * PREREAD_ACTIVE is set, else we set DELAYED which will send it to the delayed queue. - * HANDLE gets cleared if stripe_handle leaves nothing locked. + * HANDLE gets cleared if stripe_handle leave nothing locked. */ @@ -402,7 +399,7 @@ struct raid5_private_data { * (fresh device added). * Cleared when a sync completes. 
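
With this revert the stripe state flags are plain #define'd bit numbers again, manipulated through set_bit()/test_bit() on sh->state. A userspace sketch of the idiom (shift-based helpers stand in for the kernel bitops):

#include <stdio.h>

#define STRIPE_HANDLE	2
#define STRIPE_SYNCING	3

/* Userspace stand-ins for the kernel's set_bit()/test_bit(). */
static void set_flag(unsigned long *state, int bit) { *state |= 1UL << bit; }
static int  test_flag(unsigned long state, int bit) { return !!(state & (1UL << bit)); }

int main(void)
{
	unsigned long state = 0;

	set_flag(&state, STRIPE_SYNCING);
	printf("syncing=%d handle=%d\n",
	       test_flag(state, STRIPE_SYNCING),
	       test_flag(state, STRIPE_HANDLE));
	return 0;
}
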
*/ - int recovery_disabled; + /* per cpu variables */ struct raid5_percpu { struct page *spare_page; /* Used when checking P/Q in raid6 */ diff --git a/trunk/drivers/net/Makefile b/trunk/drivers/net/Makefile index e1eca2ab505e..b7622c3745fa 100644 --- a/trunk/drivers/net/Makefile +++ b/trunk/drivers/net/Makefile @@ -282,7 +282,6 @@ obj-$(CONFIG_USB_HSO) += usb/ obj-$(CONFIG_USB_USBNET) += usb/ obj-$(CONFIG_USB_ZD1201) += usb/ obj-$(CONFIG_USB_IPHETH) += usb/ -obj-$(CONFIG_USB_CDC_PHONET) += usb/ obj-$(CONFIG_WLAN) += wireless/ obj-$(CONFIG_NET_TULIP) += tulip/ diff --git a/trunk/drivers/net/acenic.c b/trunk/drivers/net/acenic.c index 31798f5f5d06..536038b22710 100644 --- a/trunk/drivers/net/acenic.c +++ b/trunk/drivers/net/acenic.c @@ -1502,13 +1502,13 @@ static int __devinit ace_init(struct net_device *dev) * firmware to wipe the ring without re-initializing it. */ if (!test_and_set_bit(0, &ap->std_refill_busy)) - ace_load_std_rx_ring(dev, RX_RING_SIZE); + ace_load_std_rx_ring(ap, RX_RING_SIZE); else printk(KERN_ERR "%s: Someone is busy refilling the RX ring\n", ap->name); if (ap->version >= 2) { if (!test_and_set_bit(0, &ap->mini_refill_busy)) - ace_load_mini_rx_ring(dev, RX_MINI_SIZE); + ace_load_mini_rx_ring(ap, RX_MINI_SIZE); else printk(KERN_ERR "%s: Someone is busy refilling " "the RX mini ring\n", ap->name); @@ -1584,10 +1584,9 @@ static void ace_watchdog(struct net_device *data) } -static void ace_tasklet(unsigned long arg) +static void ace_tasklet(unsigned long dev) { - struct net_device *dev = (struct net_device *) arg; - struct ace_private *ap = netdev_priv(dev); + struct ace_private *ap = netdev_priv((struct net_device *)dev); int cur_size; cur_size = atomic_read(&ap->cur_rx_bufs); @@ -1596,7 +1595,7 @@ static void ace_tasklet(unsigned long arg) #ifdef DEBUG printk("refilling buffers (current %i)\n", cur_size); #endif - ace_load_std_rx_ring(dev, RX_RING_SIZE - cur_size); + ace_load_std_rx_ring(ap, RX_RING_SIZE - cur_size); } if (ap->version >= 2) { @@ -1607,7 +1606,7 @@ static void ace_tasklet(unsigned long arg) printk("refilling mini buffers (current %i)\n", cur_size); #endif - ace_load_mini_rx_ring(dev, RX_MINI_SIZE - cur_size); + ace_load_mini_rx_ring(ap, RX_MINI_SIZE - cur_size); } } @@ -1617,7 +1616,7 @@ static void ace_tasklet(unsigned long arg) #ifdef DEBUG printk("refilling jumbo buffers (current %i)\n", cur_size); #endif - ace_load_jumbo_rx_ring(dev, RX_JUMBO_SIZE - cur_size); + ace_load_jumbo_rx_ring(ap, RX_JUMBO_SIZE - cur_size); } ap->tasklet_pending = 0; } @@ -1643,9 +1642,8 @@ static void ace_dump_trace(struct ace_private *ap) * done only before the device is enabled, thus no interrupts are * generated and by the interrupt handler/tasklet handler. 
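
In the ace_load_*_rx_ring() hunks below, netdev_alloc_skb_ip_align() is replaced by its open-coded equivalent: allocate NET_IP_ALIGN extra bytes, then skb_reserve() them so the IP header lands on a 4-byte boundary after the 14-byte Ethernet header. The arithmetic is easy to check in isolation:

#include <stdio.h>

#define NET_IP_ALIGN 2	/* the usual value on most architectures */
#define ETH_HLEN     14

int main(void)
{
	/* Pretend the allocator hands back 4-byte-aligned data. */
	unsigned long data = 0x1000;

	data += NET_IP_ALIGN;		/* skb_reserve(skb, NET_IP_ALIGN) */
	printf("IP header offset %% 4 = %lu\n", (data + ETH_HLEN) % 4);
	return 0;
}
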
*/ -static void ace_load_std_rx_ring(struct net_device *dev, int nr_bufs) +static void ace_load_std_rx_ring(struct ace_private *ap, int nr_bufs) { - struct ace_private *ap = netdev_priv(dev); struct ace_regs __iomem *regs = ap->regs; short i, idx; @@ -1659,10 +1657,11 @@ static void ace_load_std_rx_ring(struct net_device *dev, int nr_bufs) struct rx_desc *rd; dma_addr_t mapping; - skb = netdev_alloc_skb_ip_align(dev, ACE_STD_BUFSIZE); + skb = dev_alloc_skb(ACE_STD_BUFSIZE + NET_IP_ALIGN); if (!skb) break; + skb_reserve(skb, NET_IP_ALIGN); mapping = pci_map_page(ap->pdev, virt_to_page(skb->data), offset_in_page(skb->data), ACE_STD_BUFSIZE, @@ -1706,9 +1705,8 @@ static void ace_load_std_rx_ring(struct net_device *dev, int nr_bufs) } -static void ace_load_mini_rx_ring(struct net_device *dev, int nr_bufs) +static void ace_load_mini_rx_ring(struct ace_private *ap, int nr_bufs) { - struct ace_private *ap = netdev_priv(dev); struct ace_regs __iomem *regs = ap->regs; short i, idx; @@ -1720,10 +1718,11 @@ static void ace_load_mini_rx_ring(struct net_device *dev, int nr_bufs) struct rx_desc *rd; dma_addr_t mapping; - skb = netdev_alloc_skb_ip_align(dev, ACE_MINI_BUFSIZE); + skb = dev_alloc_skb(ACE_MINI_BUFSIZE + NET_IP_ALIGN); if (!skb) break; + skb_reserve(skb, NET_IP_ALIGN); mapping = pci_map_page(ap->pdev, virt_to_page(skb->data), offset_in_page(skb->data), ACE_MINI_BUFSIZE, @@ -1763,9 +1762,8 @@ static void ace_load_mini_rx_ring(struct net_device *dev, int nr_bufs) * Load the jumbo rx ring, this may happen at any time if the MTU * is changed to a value > 1500. */ -static void ace_load_jumbo_rx_ring(struct net_device *dev, int nr_bufs) +static void ace_load_jumbo_rx_ring(struct ace_private *ap, int nr_bufs) { - struct ace_private *ap = netdev_priv(dev); struct ace_regs __iomem *regs = ap->regs; short i, idx; @@ -1776,10 +1774,11 @@ static void ace_load_jumbo_rx_ring(struct net_device *dev, int nr_bufs) struct rx_desc *rd; dma_addr_t mapping; - skb = netdev_alloc_skb_ip_align(dev, ACE_JUMBO_BUFSIZE); + skb = dev_alloc_skb(ACE_JUMBO_BUFSIZE + NET_IP_ALIGN); if (!skb) break; + skb_reserve(skb, NET_IP_ALIGN); mapping = pci_map_page(ap->pdev, virt_to_page(skb->data), offset_in_page(skb->data), ACE_JUMBO_BUFSIZE, @@ -2197,7 +2196,7 @@ static irqreturn_t ace_interrupt(int irq, void *dev_id) #ifdef DEBUG printk("low on std buffers %i\n", cur_size); #endif - ace_load_std_rx_ring(dev, + ace_load_std_rx_ring(ap, RX_RING_SIZE - cur_size); } else run_tasklet = 1; @@ -2213,8 +2212,7 @@ static irqreturn_t ace_interrupt(int irq, void *dev_id) printk("low on mini buffers %i\n", cur_size); #endif - ace_load_mini_rx_ring(dev, - RX_MINI_SIZE - cur_size); + ace_load_mini_rx_ring(ap, RX_MINI_SIZE - cur_size); } else run_tasklet = 1; } @@ -2230,8 +2228,7 @@ static irqreturn_t ace_interrupt(int irq, void *dev_id) printk("low on jumbo buffers %i\n", cur_size); #endif - ace_load_jumbo_rx_ring(dev, - RX_JUMBO_SIZE - cur_size); + ace_load_jumbo_rx_ring(ap, RX_JUMBO_SIZE - cur_size); } else run_tasklet = 1; } @@ -2270,7 +2267,7 @@ static int ace_open(struct net_device *dev) if (ap->jumbo && !test_and_set_bit(0, &ap->jumbo_refill_busy)) - ace_load_jumbo_rx_ring(dev, RX_JUMBO_SIZE); + ace_load_jumbo_rx_ring(ap, RX_JUMBO_SIZE); if (dev->flags & IFF_PROMISC) { cmd.evt = C_SET_PROMISC_MODE; @@ -2578,7 +2575,7 @@ static int ace_change_mtu(struct net_device *dev, int new_mtu) "support\n", dev->name); ap->jumbo = 1; if (!test_and_set_bit(0, &ap->jumbo_refill_busy)) - ace_load_jumbo_rx_ring(dev, RX_JUMBO_SIZE); + 
ace_load_jumbo_rx_ring(ap, RX_JUMBO_SIZE); ace_set_rxtx_parms(dev, 1); } } else { diff --git a/trunk/drivers/net/acenic.h b/trunk/drivers/net/acenic.h index 51c486cfbb8c..f67dc9b0eb80 100644 --- a/trunk/drivers/net/acenic.h +++ b/trunk/drivers/net/acenic.h @@ -766,9 +766,9 @@ static inline void ace_unmask_irq(struct net_device *dev) * Prototypes */ static int ace_init(struct net_device *dev); -static void ace_load_std_rx_ring(struct net_device *dev, int nr_bufs); -static void ace_load_mini_rx_ring(struct net_device *dev, int nr_bufs); -static void ace_load_jumbo_rx_ring(struct net_device *dev, int nr_bufs); +static void ace_load_std_rx_ring(struct ace_private *ap, int nr_bufs); +static void ace_load_mini_rx_ring(struct ace_private *ap, int nr_bufs); +static void ace_load_jumbo_rx_ring(struct ace_private *ap, int nr_bufs); static irqreturn_t ace_interrupt(int irq, void *dev_id); static int ace_load_firmware(struct net_device *dev); static int ace_open(struct net_device *dev); diff --git a/trunk/drivers/net/bonding/bond_main.c b/trunk/drivers/net/bonding/bond_main.c index 38a83acd502e..02842d05c11f 100644 --- a/trunk/drivers/net/bonding/bond_main.c +++ b/trunk/drivers/net/bonding/bond_main.c @@ -1557,10 +1557,8 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev) if (slave_dev->type != ARPHRD_ETHER) bond_setup_by_slave(bond_dev, slave_dev); - else { + else ether_setup(bond_dev); - bond_dev->priv_flags &= ~IFF_TX_SKB_SHARING; - } netdev_bonding_change(bond_dev, NETDEV_POST_TYPE_CHANGE); @@ -4332,7 +4330,7 @@ static void bond_setup(struct net_device *bond_dev) bond_dev->tx_queue_len = 0; bond_dev->flags |= IFF_MASTER|IFF_MULTICAST; bond_dev->priv_flags |= IFF_BONDING; - bond_dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING); + bond_dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; /* At first, we block adding VLANs. That's the only way to * prevent problems that occur when adding VLANs over an @@ -4693,7 +4691,7 @@ static int bond_check_params(struct bond_params *params) /* miimon and arp_interval not set, we need one so things * work as expected, see bonding.txt for details */ - pr_debug("Warning: either miimon or arp_interval and arp_ip_target module parameters must be specified, otherwise bonding will not detect link failures! see bonding.txt for details.\n"); + pr_warning("Warning: either miimon or arp_interval and arp_ip_target module parameters must be specified, otherwise bonding will not detect link failures! 
see bonding.txt for details.\n"); } if (primary && !USES_PRIMARY(bond_mode)) { diff --git a/trunk/drivers/net/bonding/bond_sysfs.c b/trunk/drivers/net/bonding/bond_sysfs.c index 2dfb4bf90087..b60835f58650 100644 --- a/trunk/drivers/net/bonding/bond_sysfs.c +++ b/trunk/drivers/net/bonding/bond_sysfs.c @@ -1025,7 +1025,6 @@ static ssize_t bonding_store_primary(struct device *d, int i; struct slave *slave; struct bonding *bond = to_bond(d); - char ifname[IFNAMSIZ]; if (!rtnl_trylock()) return restart_syscall(); @@ -1036,33 +1035,32 @@ static ssize_t bonding_store_primary(struct device *d, if (!USES_PRIMARY(bond->params.mode)) { pr_info("%s: Unable to set primary slave; %s is in mode %d\n", bond->dev->name, bond->dev->name, bond->params.mode); - goto out; - } - - sscanf(buf, "%16s", ifname); /* IFNAMSIZ */ + } else { + bond_for_each_slave(bond, slave, i) { + if (strnicmp + (slave->dev->name, buf, + strlen(slave->dev->name)) == 0) { + pr_info("%s: Setting %s as primary slave.\n", + bond->dev->name, slave->dev->name); + bond->primary_slave = slave; + strcpy(bond->params.primary, slave->dev->name); + bond_select_active_slave(bond); + goto out; + } + } - /* check to see if we are clearing primary */ - if (!strlen(ifname) || buf[0] == '\n') { - pr_info("%s: Setting primary slave to None.\n", - bond->dev->name); - bond->primary_slave = NULL; - bond_select_active_slave(bond); - goto out; - } + /* if we got here, then we didn't match the name of any slave */ - bond_for_each_slave(bond, slave, i) { - if (strncmp(slave->dev->name, ifname, IFNAMSIZ) == 0) { - pr_info("%s: Setting %s as primary slave.\n", - bond->dev->name, slave->dev->name); - bond->primary_slave = slave; - strcpy(bond->params.primary, slave->dev->name); - bond_select_active_slave(bond); - goto out; + if (strlen(buf) == 0 || buf[0] == '\n') { + pr_info("%s: Setting primary slave to None.\n", + bond->dev->name); + bond->primary_slave = NULL; + bond_select_active_slave(bond); + } else { + pr_info("%s: Unable to set %.*s as primary slave as it is not a slave.\n", + bond->dev->name, (int)strlen(buf) - 1, buf); } } - - pr_info("%s: Unable to set %.*s as primary slave.\n", - bond->dev->name, (int)strlen(buf) - 1, buf); out: write_unlock_bh(&bond->curr_slave_lock); read_unlock(&bond->lock); @@ -1197,7 +1195,6 @@ static ssize_t bonding_store_active_slave(struct device *d, struct slave *old_active = NULL; struct slave *new_active = NULL; struct bonding *bond = to_bond(d); - char ifname[IFNAMSIZ]; if (!rtnl_trylock()) return restart_syscall(); @@ -1206,62 +1203,56 @@ static ssize_t bonding_store_active_slave(struct device *d, read_lock(&bond->lock); write_lock_bh(&bond->curr_slave_lock); - if (!USES_PRIMARY(bond->params.mode)) { + if (!USES_PRIMARY(bond->params.mode)) pr_info("%s: Unable to change active slave; %s is in mode %d\n", bond->dev->name, bond->dev->name, bond->params.mode); - goto out; - } - - sscanf(buf, "%16s", ifname); /* IFNAMSIZ */ - - /* check to see if we are clearing active */ - if (!strlen(ifname) || buf[0] == '\n') { - pr_info("%s: Clearing current active slave.\n", - bond->dev->name); - bond->curr_active_slave = NULL; - bond_select_active_slave(bond); - goto out; - } - - bond_for_each_slave(bond, slave, i) { - if (strncmp(slave->dev->name, ifname, IFNAMSIZ) == 0) { - old_active = bond->curr_active_slave; - new_active = slave; - if (new_active == old_active) { - /* do nothing */ - pr_info("%s: %s is already the current" - " active slave.\n", - bond->dev->name, - slave->dev->name); - goto out; - } - else { - if ((new_active) && 
- (old_active) && - (new_active->link == BOND_LINK_UP) && - IS_UP(new_active->dev)) { - pr_info("%s: Setting %s as active" - " slave.\n", + else { + bond_for_each_slave(bond, slave, i) { + if (strnicmp + (slave->dev->name, buf, + strlen(slave->dev->name)) == 0) { + old_active = bond->curr_active_slave; + new_active = slave; + if (new_active == old_active) { + /* do nothing */ + pr_info("%s: %s is already the current active slave.\n", bond->dev->name, slave->dev->name); - bond_change_active_slave(bond, - new_active); + goto out; } else { - pr_info("%s: Could not set %s as" - " active slave; either %s is" - " down or the link is down.\n", - bond->dev->name, - slave->dev->name, - slave->dev->name); + if ((new_active) && + (old_active) && + (new_active->link == BOND_LINK_UP) && + IS_UP(new_active->dev)) { + pr_info("%s: Setting %s as active slave.\n", + bond->dev->name, + slave->dev->name); + bond_change_active_slave(bond, new_active); + } + else { + pr_info("%s: Could not set %s as active slave; either %s is down or the link is down.\n", + bond->dev->name, + slave->dev->name, + slave->dev->name); + } + goto out; } - goto out; } } - } - pr_info("%s: Unable to set %.*s as active slave.\n", - bond->dev->name, (int)strlen(buf) - 1, buf); + /* if we got here, then we didn't match the name of any slave */ + + if (strlen(buf) == 0 || buf[0] == '\n') { + pr_info("%s: Setting active slave to None.\n", + bond->dev->name); + bond->primary_slave = NULL; + bond_select_active_slave(bond); + } else { + pr_info("%s: Unable to set %.*s as active slave as it is not a slave.\n", + bond->dev->name, (int)strlen(buf) - 1, buf); + } + } out: write_unlock_bh(&bond->curr_slave_lock); read_unlock(&bond->lock); diff --git a/trunk/drivers/net/forcedeth.c b/trunk/drivers/net/forcedeth.c index e55df308a3af..e64cd9ceac3f 100644 --- a/trunk/drivers/net/forcedeth.c +++ b/trunk/drivers/net/forcedeth.c @@ -2764,14 +2764,7 @@ static int nv_rx_process_optimized(struct net_device *dev, int limit) prefetch(skb->data); vlanflags = le32_to_cpu(np->get_rx.ex->buflow); - - /* - * There's need to check for NETIF_F_HW_VLAN_RX here. - * Even if vlan rx accel is disabled, - * NV_RX3_VLAN_TAG_PRESENT is pseudo randomly set. 
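
In the forcedeth hunk just below, the NETIF_F_HW_VLAN_RX feature test goes away and the tag is pulled straight from the descriptor word whenever the present bit is set. A sketch of the extraction; the two mask values here are assumptions, the real NV_RX3_* constants live in forcedeth.c:

#include <stdint.h>
#include <stdio.h>

/* Assumed layouts; see forcedeth.c for the real NV_RX3_* constants. */
#define NV_RX3_VLAN_TAG_PRESENT	(1u << 16)
#define NV_RX3_VLAN_TAG_MASK	0x0000ffffu

int main(void)
{
	uint32_t vlanflags = NV_RX3_VLAN_TAG_PRESENT | 42;

	if (vlanflags & NV_RX3_VLAN_TAG_PRESENT) {
		uint16_t vid = vlanflags & NV_RX3_VLAN_TAG_MASK;
		printf("vid=%u\n", vid);	/* handed to the stack */
	}
	return 0;
}
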
- */ - if (dev->features & NETIF_F_HW_VLAN_RX && - vlanflags & NV_RX3_VLAN_TAG_PRESENT) { + if (vlanflags & NV_RX3_VLAN_TAG_PRESENT) { u16 vid = vlanflags & NV_RX3_VLAN_TAG_MASK; __vlan_hwaccel_put_tag(skb, vid); @@ -5338,16 +5331,15 @@ static int __devinit nv_probe(struct pci_dev *pci_dev, const struct pci_device_i np->txrxctl_bits |= NVREG_TXRXCTL_RXCHECK; dev->hw_features |= NETIF_F_IP_CSUM | NETIF_F_SG | NETIF_F_TSO | NETIF_F_RXCSUM; + dev->features |= dev->hw_features; } np->vlanctl_bits = 0; if (id->driver_data & DEV_HAS_VLAN) { np->vlanctl_bits = NVREG_VLANCONTROL_ENABLE; - dev->hw_features |= NETIF_F_HW_VLAN_RX | NETIF_F_HW_VLAN_TX; + dev->features |= NETIF_F_HW_VLAN_RX | NETIF_F_HW_VLAN_TX; } - dev->features |= dev->hw_features; - np->pause_flags = NV_PAUSEFRAME_RX_CAPABLE | NV_PAUSEFRAME_RX_REQ | NV_PAUSEFRAME_AUTONEG; if ((id->driver_data & DEV_HAS_PAUSEFRAME_TX_V1) || (id->driver_data & DEV_HAS_PAUSEFRAME_TX_V2) || @@ -5615,8 +5607,6 @@ static int __devinit nv_probe(struct pci_dev *pci_dev, const struct pci_device_i goto out_error; } - nv_vlan_mode(dev, dev->features); - netif_carrier_off(dev); dev_info(&pci_dev->dev, "ifname %s, PHY OUI 0x%x @ %d, addr %pM\n", diff --git a/trunk/drivers/net/gianfar.c b/trunk/drivers/net/gianfar.c index 2659daad783d..835cd2588148 100644 --- a/trunk/drivers/net/gianfar.c +++ b/trunk/drivers/net/gianfar.c @@ -388,9 +388,13 @@ static void gfar_init_mac(struct net_device *ndev) if (priv->hwts_rx_en) rctrl |= RCTRL_PRSDEP_INIT | RCTRL_TS_ENABLE; - if (ndev->features & NETIF_F_HW_VLAN_RX) + /* keep vlan related bits if it's enabled */ + if (ndev->features & NETIF_F_HW_VLAN_TX) rctrl |= RCTRL_VLEX | RCTRL_PRSDEP_INIT; + if (ndev->features & NETIF_F_HW_VLAN_RX) + tctrl |= TCTRL_VLINS; + /* Init rctrl based on our settings */ gfar_write(®s->rctrl, rctrl); diff --git a/trunk/drivers/net/ifb.c b/trunk/drivers/net/ifb.c index 46b5f5fd686b..6e82dd32e806 100644 --- a/trunk/drivers/net/ifb.c +++ b/trunk/drivers/net/ifb.c @@ -183,7 +183,7 @@ static void ifb_setup(struct net_device *dev) dev->flags |= IFF_NOARP; dev->flags &= ~IFF_MULTICAST; - dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING); + dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; random_ether_addr(dev->dev_addr); } diff --git a/trunk/drivers/net/macvlan.c b/trunk/drivers/net/macvlan.c index 05172c39a0ce..ba631fcece34 100644 --- a/trunk/drivers/net/macvlan.c +++ b/trunk/drivers/net/macvlan.c @@ -572,7 +572,7 @@ void macvlan_common_setup(struct net_device *dev) { ether_setup(dev); - dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING); + dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; dev->netdev_ops = &macvlan_netdev_ops; dev->destructor = free_netdev; dev->header_ops = &macvlan_hard_header_ops, diff --git a/trunk/drivers/net/tg3.c b/trunk/drivers/net/tg3.c index dc3fbf61910b..803576568154 100644 --- a/trunk/drivers/net/tg3.c +++ b/trunk/drivers/net/tg3.c @@ -190,7 +190,6 @@ static inline void _tg3_flag_clear(enum TG3_FLAGS flag, unsigned long *bits) /* minimum number of free TX descriptors required to wake up TX process */ #define TG3_TX_WAKEUP_THRESH(tnapi) ((tnapi)->tx_pending / 4) -#define TG3_TX_BD_DMA_MAX 4096 #define TG3_RAW_IP_ALIGN 2 @@ -4825,7 +4824,7 @@ static void tg3_tx(struct tg3_napi *tnapi) txq = netdev_get_tx_queue(tp->dev, index); while (sw_idx != hw_idx) { - struct tg3_tx_ring_info *ri = &tnapi->tx_buffers[sw_idx]; + struct ring_info *ri = &tnapi->tx_buffers[sw_idx]; struct sk_buff *skb = ri->skb; int i, tx_bug = 0; @@ -4841,12 +4840,6 @@ static void tg3_tx(struct 
tg3_napi *tnapi) ri->skb = NULL; - while (ri->fragmented) { - ri->fragmented = false; - sw_idx = NEXT_TX(sw_idx); - ri = &tnapi->tx_buffers[sw_idx]; - } - sw_idx = NEXT_TX(sw_idx); for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { @@ -4858,13 +4851,6 @@ static void tg3_tx(struct tg3_napi *tnapi) dma_unmap_addr(ri, mapping), skb_shinfo(skb)->frags[i].size, PCI_DMA_TODEVICE); - - while (ri->fragmented) { - ri->fragmented = false; - sw_idx = NEXT_TX(sw_idx); - ri = &tnapi->tx_buffers[sw_idx]; - } - sw_idx = NEXT_TX(sw_idx); } @@ -5915,100 +5901,40 @@ static inline int tg3_40bit_overflow_test(struct tg3 *tp, dma_addr_t mapping, #endif } -static inline void tg3_tx_set_bd(struct tg3_tx_buffer_desc *txbd, - dma_addr_t mapping, u32 len, u32 flags, - u32 mss, u32 vlan) -{ - txbd->addr_hi = ((u64) mapping >> 32); - txbd->addr_lo = ((u64) mapping & 0xffffffff); - txbd->len_flags = (len << TXD_LEN_SHIFT) | (flags & 0x0000ffff); - txbd->vlan_tag = (mss << TXD_MSS_SHIFT) | (vlan << TXD_VLAN_TAG_SHIFT); -} - -static bool tg3_tx_frag_set(struct tg3_napi *tnapi, u32 *entry, u32 *budget, - dma_addr_t map, u32 len, u32 flags, - u32 mss, u32 vlan) +static void tg3_set_txd(struct tg3_napi *tnapi, int entry, + dma_addr_t mapping, int len, u32 flags, + u32 mss_and_is_end) { - struct tg3 *tp = tnapi->tp; - bool hwbug = false; - - if (tg3_flag(tp, SHORT_DMA_BUG) && len <= 8) - hwbug = 1; - - if (tg3_4g_overflow_test(map, len)) - hwbug = 1; - - if (tg3_40bit_overflow_test(tp, map, len)) - hwbug = 1; - - if (tg3_flag(tp, 4K_FIFO_LIMIT)) { - u32 tmp_flag = flags & ~TXD_FLAG_END; - while (len > TG3_TX_BD_DMA_MAX) { - u32 frag_len = TG3_TX_BD_DMA_MAX; - len -= TG3_TX_BD_DMA_MAX; - - if (len) { - tnapi->tx_buffers[*entry].fragmented = true; - /* Avoid the 8byte DMA problem */ - if (len <= 8) { - len += TG3_TX_BD_DMA_MAX / 2; - frag_len = TG3_TX_BD_DMA_MAX / 2; - } - } else - tmp_flag = flags; - - if (*budget) { - tg3_tx_set_bd(&tnapi->tx_ring[*entry], map, - frag_len, tmp_flag, mss, vlan); - (*budget)--; - *entry = NEXT_TX(*entry); - } else { - hwbug = 1; - break; - } - - map += frag_len; - } + struct tg3_tx_buffer_desc *txd = &tnapi->tx_ring[entry]; + int is_end = (mss_and_is_end & 0x1); + u32 mss = (mss_and_is_end >> 1); + u32 vlan_tag = 0; - if (len) { - if (*budget) { - tg3_tx_set_bd(&tnapi->tx_ring[*entry], map, - len, flags, mss, vlan); - (*budget)--; - *entry = NEXT_TX(*entry); - } else { - hwbug = 1; - } - } - } else { - tg3_tx_set_bd(&tnapi->tx_ring[*entry], map, - len, flags, mss, vlan); - *entry = NEXT_TX(*entry); + if (is_end) + flags |= TXD_FLAG_END; + if (flags & TXD_FLAG_VLAN) { + vlan_tag = flags >> 16; + flags &= 0xffff; } + vlan_tag |= (mss << TXD_MSS_SHIFT); - return hwbug; + txd->addr_hi = ((u64) mapping >> 32); + txd->addr_lo = ((u64) mapping & 0xffffffff); + txd->len_flags = (len << TXD_LEN_SHIFT) | flags; + txd->vlan_tag = vlan_tag << TXD_VLAN_TAG_SHIFT; } -static void tg3_tx_skb_unmap(struct tg3_napi *tnapi, u32 entry, int last) +static void tg3_skb_error_unmap(struct tg3_napi *tnapi, + struct sk_buff *skb, int last) { int i; - struct sk_buff *skb; - struct tg3_tx_ring_info *txb = &tnapi->tx_buffers[entry]; - - skb = txb->skb; - txb->skb = NULL; + u32 entry = tnapi->tx_prod; + struct ring_info *txb = &tnapi->tx_buffers[entry]; pci_unmap_single(tnapi->tp->pdev, dma_unmap_addr(txb, mapping), skb_headlen(skb), PCI_DMA_TODEVICE); - - while (txb->fragmented) { - txb->fragmented = false; - entry = NEXT_TX(entry); - txb = &tnapi->tx_buffers[entry]; - } - for (i = 0; i < last; i++) { skb_frag_t *frag = 
&skb_shinfo(skb)->frags[i]; @@ -6018,24 +5944,18 @@ static void tg3_tx_skb_unmap(struct tg3_napi *tnapi, u32 entry, int last) pci_unmap_page(tnapi->tp->pdev, dma_unmap_addr(txb, mapping), frag->size, PCI_DMA_TODEVICE); - - while (txb->fragmented) { - txb->fragmented = false; - entry = NEXT_TX(entry); - txb = &tnapi->tx_buffers[entry]; - } } } /* Workaround 4GB and 40-bit hardware DMA bugs. */ static int tigon3_dma_hwbug_workaround(struct tg3_napi *tnapi, struct sk_buff *skb, - u32 *entry, u32 *budget, - u32 base_flags, u32 mss, u32 vlan) + u32 base_flags, u32 mss) { struct tg3 *tp = tnapi->tp; struct sk_buff *new_skb; dma_addr_t new_addr = 0; + u32 entry = tnapi->tx_prod; int ret = 0; if (GET_ASIC_REV(tp->pci_chip_rev_id) != ASIC_REV_5701) @@ -6056,22 +5976,24 @@ static int tigon3_dma_hwbug_workaround(struct tg3_napi *tnapi, PCI_DMA_TODEVICE); /* Make sure the mapping succeeded */ if (pci_dma_mapping_error(tp->pdev, new_addr)) { + ret = -1; dev_kfree_skb(new_skb); + + /* Make sure new skb does not cross any 4G boundaries. + * Drop the packet if it does. + */ + } else if (tg3_4g_overflow_test(new_addr, new_skb->len)) { + pci_unmap_single(tp->pdev, new_addr, new_skb->len, + PCI_DMA_TODEVICE); ret = -1; + dev_kfree_skb(new_skb); } else { - base_flags |= TXD_FLAG_END; - - tnapi->tx_buffers[*entry].skb = new_skb; - dma_unmap_addr_set(&tnapi->tx_buffers[*entry], + tnapi->tx_buffers[entry].skb = new_skb; + dma_unmap_addr_set(&tnapi->tx_buffers[entry], mapping, new_addr); - if (tg3_tx_frag_set(tnapi, entry, budget, new_addr, - new_skb->len, base_flags, - mss, vlan)) { - tg3_tx_skb_unmap(tnapi, *entry, 0); - dev_kfree_skb(new_skb); - ret = -1; - } + tg3_set_txd(tnapi, entry, new_addr, new_skb->len, + base_flags, 1 | (mss << 1)); } } @@ -6129,8 +6051,7 @@ static int tg3_tso_bug(struct tg3 *tp, struct sk_buff *skb) static netdev_tx_t tg3_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct tg3 *tp = netdev_priv(dev); - u32 len, entry, base_flags, mss, vlan = 0; - u32 budget; + u32 len, entry, base_flags, mss; int i = -1, would_hit_hwbug; dma_addr_t mapping; struct tg3_napi *tnapi; @@ -6142,14 +6063,12 @@ static netdev_tx_t tg3_start_xmit(struct sk_buff *skb, struct net_device *dev) if (tg3_flag(tp, ENABLE_TSS)) tnapi++; - budget = tg3_tx_avail(tnapi); - /* We are running in BH disabled context with netif_tx_lock * and TX reclaim runs via tp->napi.poll inside of a software * interrupt. Furthermore, IRQ processing runs lockless so we have * no IRQ context deadlocks to worry about either. Rejoice! */ - if (unlikely(budget <= (skb_shinfo(skb)->nr_frags + 1))) { + if (unlikely(tg3_tx_avail(tnapi) <= (skb_shinfo(skb)->nr_frags + 1))) { if (!netif_tx_queue_stopped(txq)) { netif_tx_stop_queue(txq); @@ -6234,12 +6153,9 @@ static netdev_tx_t tg3_start_xmit(struct sk_buff *skb, struct net_device *dev) } } -#ifdef BCM_KERNEL_SUPPORTS_8021Q - if (vlan_tx_tag_present(skb)) { - base_flags |= TXD_FLAG_VLAN; - vlan = vlan_tx_tag_get(skb); - } -#endif + if (vlan_tx_tag_present(skb)) + base_flags |= (TXD_FLAG_VLAN | + (vlan_tx_tag_get(skb) << 16)); if (tg3_flag(tp, USE_JUMBO_BDFLAG) && !mss && skb->len > VLAN_ETH_FRAME_LEN) @@ -6258,23 +6174,25 @@ static netdev_tx_t tg3_start_xmit(struct sk_buff *skb, struct net_device *dev) would_hit_hwbug = 0; - if (tg3_flag(tp, 5701_DMA_BUG)) + if (tg3_flag(tp, SHORT_DMA_BUG) && len <= 8) would_hit_hwbug = 1; - if (tg3_tx_frag_set(tnapi, &entry, &budget, mapping, len, base_flags | - ((skb_shinfo(skb)->nr_frags == 0) ? 
TXD_FLAG_END : 0), - mss, vlan)) + if (tg3_4g_overflow_test(mapping, len)) would_hit_hwbug = 1; - /* Now loop through additional data fragments, and queue them. */ - if (skb_shinfo(skb)->nr_frags > 0) { - u32 tmp_mss = mss; + if (tg3_40bit_overflow_test(tp, mapping, len)) + would_hit_hwbug = 1; + + if (tg3_flag(tp, 5701_DMA_BUG)) + would_hit_hwbug = 1; + + tg3_set_txd(tnapi, entry, mapping, len, base_flags, + (skb_shinfo(skb)->nr_frags == 0) | (mss << 1)); - if (!tg3_flag(tp, HW_TSO_1) && - !tg3_flag(tp, HW_TSO_2) && - !tg3_flag(tp, HW_TSO_3)) - tmp_mss = 0; + entry = NEXT_TX(entry); + /* Now loop through additional data fragments, and queue them. */ + if (skb_shinfo(skb)->nr_frags > 0) { last = skb_shinfo(skb)->nr_frags - 1; for (i = 0; i <= last; i++) { skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; @@ -6291,25 +6209,39 @@ static netdev_tx_t tg3_start_xmit(struct sk_buff *skb, struct net_device *dev) if (pci_dma_mapping_error(tp->pdev, mapping)) goto dma_error; - if (tg3_tx_frag_set(tnapi, &entry, &budget, mapping, - len, base_flags | - ((i == last) ? TXD_FLAG_END : 0), - tmp_mss, vlan)) + if (tg3_flag(tp, SHORT_DMA_BUG) && + len <= 8) would_hit_hwbug = 1; + + if (tg3_4g_overflow_test(mapping, len)) + would_hit_hwbug = 1; + + if (tg3_40bit_overflow_test(tp, mapping, len)) + would_hit_hwbug = 1; + + if (tg3_flag(tp, HW_TSO_1) || + tg3_flag(tp, HW_TSO_2) || + tg3_flag(tp, HW_TSO_3)) + tg3_set_txd(tnapi, entry, mapping, len, + base_flags, (i == last)|(mss << 1)); + else + tg3_set_txd(tnapi, entry, mapping, len, + base_flags, (i == last)); + + entry = NEXT_TX(entry); } } if (would_hit_hwbug) { - tg3_tx_skb_unmap(tnapi, tnapi->tx_prod, i); + tg3_skb_error_unmap(tnapi, skb, i); /* If the workaround fails due to memory/mapping * failure, silently drop this packet. 
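
The tg3_set_txd() calls around this hunk pack two values into the last argument: bit 0 is the is-end flag and the remaining bits carry the MSS, so callers pass (mss << 1) | is_end. A round-trip check of that encoding:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint32_t mss = 1460, is_end = 1;
	uint32_t packed = (mss << 1) | is_end;	/* what callers hand in */

	/* what tg3_set_txd() recovers */
	assert((packed & 0x1) == is_end);
	assert((packed >> 1) == mss);
	return 0;
}
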
*/ - entry = tnapi->tx_prod; - budget = tg3_tx_avail(tnapi); - if (tigon3_dma_hwbug_workaround(tnapi, skb, &entry, &budget, - base_flags, mss, vlan)) + if (tigon3_dma_hwbug_workaround(tnapi, skb, base_flags, mss)) goto out_unlock; + + entry = NEXT_TX(tnapi->tx_prod); } skb_tx_timestamp(skb); @@ -6337,7 +6269,7 @@ static netdev_tx_t tg3_start_xmit(struct sk_buff *skb, struct net_device *dev) return NETDEV_TX_OK; dma_error: - tg3_tx_skb_unmap(tnapi, tnapi->tx_prod, i); + tg3_skb_error_unmap(tnapi, skb, i); dev_kfree_skb(skb); tnapi->tx_buffers[tnapi->tx_prod].skb = NULL; return NETDEV_TX_OK; @@ -6670,13 +6602,35 @@ static void tg3_free_rings(struct tg3 *tp) if (!tnapi->tx_buffers) continue; - for (i = 0; i < TG3_TX_RING_SIZE; i++) { - struct sk_buff *skb = tnapi->tx_buffers[i].skb; + for (i = 0; i < TG3_TX_RING_SIZE; ) { + struct ring_info *txp; + struct sk_buff *skb; + unsigned int k; + + txp = &tnapi->tx_buffers[i]; + skb = txp->skb; - if (!skb) + if (skb == NULL) { + i++; continue; + } + + pci_unmap_single(tp->pdev, + dma_unmap_addr(txp, mapping), + skb_headlen(skb), + PCI_DMA_TODEVICE); + txp->skb = NULL; - tg3_tx_skb_unmap(tnapi, i, skb_shinfo(skb)->nr_frags); + i++; + + for (k = 0; k < skb_shinfo(skb)->nr_frags; k++) { + txp = &tnapi->tx_buffers[i & (TG3_TX_RING_SIZE - 1)]; + pci_unmap_page(tp->pdev, + dma_unmap_addr(txp, mapping), + skb_shinfo(skb)->frags[k].size, + PCI_DMA_TODEVICE); + i++; + } dev_kfree_skb_any(skb); } @@ -6808,9 +6762,9 @@ static int tg3_alloc_consistent(struct tg3 *tp) */ if ((!i && !tg3_flag(tp, ENABLE_TSS)) || (i && tg3_flag(tp, ENABLE_TSS))) { - tnapi->tx_buffers = kzalloc( - sizeof(struct tg3_tx_ring_info) * - TG3_TX_RING_SIZE, GFP_KERNEL); + tnapi->tx_buffers = kzalloc(sizeof(struct ring_info) * + TG3_TX_RING_SIZE, + GFP_KERNEL); if (!tnapi->tx_buffers) goto err_out; @@ -8406,7 +8360,7 @@ static int tg3_reset_hw(struct tg3 *tp, int reset_phy) /* Program the jumbo buffer descriptor ring control * blocks on those devices that have them. 
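
Several hunks here widen 5719 checks from the exact CHIPREV_ID_5719_A0 stepping to GET_ASIC_REV(...) == ASIC_REV_5719, i.e. to every revision of the part. A sketch under the assumption (from tg3.h) that the ASIC revision sits in the high bits of the chip revision id; the A0/A1 values below are illustrative guesses, not the real register contents:

#include <stdint.h>
#include <stdio.h>

/* Assumed encoding: asic rev in the high bits, stepping in the low. */
#define GET_ASIC_REV(rev)	((rev) >> 12)
#define ASIC_REV_5719		0x5719
#define CHIPREV_ID_5719_A0	0x05719000	/* assumed A0 value */
#define CHIPREV_ID_5719_A1	0x05719001	/* hypothetical later stepping */

int main(void)
{
	uint32_t rev = CHIPREV_ID_5719_A1;

	/* an exact-stepping test would miss A1 ... */
	printf("A0 only: %d\n", rev == CHIPREV_ID_5719_A0);
	/* ... while the ASIC-level test catches every 5719 */
	printf("any 5719: %d\n", GET_ASIC_REV(rev) == ASIC_REV_5719);
	return 0;
}
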
 */
-	if (tp->pci_chip_rev_id == CHIPREV_ID_5719_A0 ||
+	if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5719 ||
 	    (tg3_flag(tp, JUMBO_CAPABLE) && !tg3_flag(tp, 5780_CLASS))) {
 		if (tg3_flag(tp, JUMBO_RING_ENABLE)) {
@@ -11250,7 +11204,6 @@ static int tg3_run_loopback(struct tg3 *tp, u32 pktsz, int loopback_mode)
 {
 	u32 mac_mode, rx_start_idx, rx_idx, tx_idx, opaque_key;
 	u32 base_flags = 0, mss = 0, desc_idx, coal_now, data_off, val;
-	u32 budget;
 	struct sk_buff *skb, *rx_skb;
 	u8 *tx_data;
 	dma_addr_t map;
@@ -11410,10 +11363,6 @@ static int tg3_run_loopback(struct tg3 *tp, u32 pktsz, int loopback_mode)
 		return -EIO;
 	}
 
-	val = tnapi->tx_prod;
-	tnapi->tx_buffers[val].skb = skb;
-	dma_unmap_addr_set(&tnapi->tx_buffers[val], mapping, map);
-
 	tw32_f(HOSTCC_MODE, tp->coalesce_mode | HOSTCC_MODE_ENABLE |
 	       rnapi->coal_now);
@@ -11421,13 +11370,8 @@ static int tg3_run_loopback(struct tg3 *tp, u32 pktsz, int loopback_mode)
 
 	rx_start_idx = rnapi->hw_status->idx[0].rx_producer;
 
-	budget = tg3_tx_avail(tnapi);
-	if (tg3_tx_frag_set(tnapi, &val, &budget, map, tx_len,
-			    base_flags | TXD_FLAG_END, mss, 0)) {
-		tnapi->tx_buffers[val].skb = NULL;
-		dev_kfree_skb(skb);
-		return -EIO;
-	}
+	tg3_set_txd(tnapi, tnapi->tx_prod, map, tx_len,
+		    base_flags, (mss << 1) | 1);
 
 	tnapi->tx_prod++;
@@ -11450,7 +11394,7 @@ static int tg3_run_loopback(struct tg3 *tp, u32 pktsz, int loopback_mode)
 		break;
 	}
 
-	tg3_tx_skb_unmap(tnapi, tnapi->tx_prod - 1, 0);
+	pci_unmap_single(tp->pdev, map, tx_len, PCI_DMA_TODEVICE);
 	dev_kfree_skb(skb);
 
 	if (tx_idx != tnapi->tx_prod)
@@ -13873,7 +13817,7 @@ static int __devinit tg3_get_invariants(struct tg3 *tp)
 		tg3_flag_set(tp, 5705_PLUS);
 
 	/* Determine TSO capabilities */
-	if (tp->pci_chip_rev_id == CHIPREV_ID_5719_A0)
+	if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5719)
 		; /* Do nothing. HW bug. */
 	else if (tg3_flag(tp, 57765_PLUS))
 		tg3_flag_set(tp, HW_TSO_3);
@@ -13936,14 +13880,11 @@ static int __devinit tg3_get_invariants(struct tg3 *tp)
 	if (tg3_flag(tp, 5755_PLUS))
 		tg3_flag_set(tp, SHORT_DMA_BUG);
 
-	if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5719)
-		tg3_flag_set(tp, 4K_FIFO_LIMIT);
-
 	if (tg3_flag(tp, 5717_PLUS))
 		tg3_flag_set(tp, LRG_PROD_RING_CAP);
 
 	if (tg3_flag(tp, 57765_PLUS) &&
-	    tp->pci_chip_rev_id != CHIPREV_ID_5719_A0)
+	    GET_ASIC_REV(tp->pci_chip_rev_id) != ASIC_REV_5719)
 		tg3_flag_set(tp, USE_JUMBO_BDFLAG);
 
 	if (!tg3_flag(tp, 5705_PLUS) ||
diff --git a/trunk/drivers/net/tg3.h b/trunk/drivers/net/tg3.h
index 2ea456dd5880..691539ba17b3 100644
--- a/trunk/drivers/net/tg3.h
+++ b/trunk/drivers/net/tg3.h
@@ -2652,12 +2652,6 @@ struct ring_info {
 	DEFINE_DMA_UNMAP_ADDR(mapping);
 };
 
-struct tg3_tx_ring_info {
-	struct sk_buff			*skb;
-	DEFINE_DMA_UNMAP_ADDR(mapping);
-	bool				fragmented;
-};
-
 struct tg3_link_config {
 	/* Describes what we're trying to get. */
 	u32				advertising;
@@ -2822,7 +2816,7 @@ struct tg3_napi {
 	u32				last_tx_cons;
 	u32				prodmbox;
 	struct tg3_tx_buffer_desc	*tx_ring;
-	struct tg3_tx_ring_info		*tx_buffers;
+	struct ring_info		*tx_buffers;
 
 	dma_addr_t			status_mapping;
 	dma_addr_t			rx_rcb_mapping;
@@ -2905,7 +2899,6 @@ enum TG3_FLAGS {
 	TG3_FLAG_57765_PLUS,
 	TG3_FLAG_APE_HAS_NCSI,
 	TG3_FLAG_5717_PLUS,
-	TG3_FLAG_4K_FIFO_LIMIT,
 
 	/* Add new flags before this comment and TG3_FLAG_NUMBER_OF_FLAGS */
 	TG3_FLAG_NUMBER_OF_FLAGS,	/* Last entry in enum TG3_FLAGS */
diff --git a/trunk/drivers/net/tun.c b/trunk/drivers/net/tun.c
index 71f3d1a35b74..9a6b3824da14 100644
--- a/trunk/drivers/net/tun.c
+++ b/trunk/drivers/net/tun.c
@@ -528,7 +528,6 @@ static void tun_net_init(struct net_device *dev)
 		dev->netdev_ops = &tap_netdev_ops;
 		/* Ethernet TAP Device */
 		ether_setup(dev);
-		dev->priv_flags &= ~IFF_TX_SKB_SHARING;
 
 		random_ether_addr(dev->dev_addr);
diff --git a/trunk/drivers/net/usb/asix.c b/trunk/drivers/net/usb/asix.c
index c5c4b4def7fb..52502883523e 100644
--- a/trunk/drivers/net/usb/asix.c
+++ b/trunk/drivers/net/usb/asix.c
@@ -314,11 +314,12 @@ static int asix_rx_fixup(struct usbnet *dev, struct sk_buff *skb)
 	skb_pull(skb, 4);
 
 	while (skb->len > 0) {
-		if ((header & 0x07ff) != ((~header >> 16) & 0x07ff))
+		if ((short)(header & 0x0000ffff) !=
+		    ~((short)((header & 0xffff0000) >> 16))) {
 			netdev_err(dev->net, "asix_rx_fixup() Bad Header Length\n");
-
+		}
 		/* get the packet length */
-		size = (u16) (header & 0x000007ff);
+		size = (u16) (header & 0x0000ffff);
 
 		if ((skb->len) - ((size + 1) & 0xfffe) == 0) {
 			u8 alignment = (unsigned long)skb->data & 0x3;
diff --git a/trunk/drivers/net/veth.c b/trunk/drivers/net/veth.c
index 5b23767ea817..7f78db7bd68d 100644
--- a/trunk/drivers/net/veth.c
+++ b/trunk/drivers/net/veth.c
@@ -263,8 +263,6 @@ static void veth_setup(struct net_device *dev)
 {
 	ether_setup(dev);
 
-	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
-
 	dev->netdev_ops = &veth_netdev_ops;
 	dev->ethtool_ops = &veth_ethtool_ops;
 	dev->features |= NETIF_F_LLTX;
diff --git a/trunk/drivers/net/wan/hdlc_fr.c b/trunk/drivers/net/wan/hdlc_fr.c
index eb2028187fbe..b25c9229a6a9 100644
--- a/trunk/drivers/net/wan/hdlc_fr.c
+++ b/trunk/drivers/net/wan/hdlc_fr.c
@@ -1074,10 +1074,9 @@ static int fr_add_pvc(struct net_device *frad, unsigned int dlci, int type)
 
 	used = pvc_is_used(pvc);
 
-	if (type == ARPHRD_ETHER) {
+	if (type == ARPHRD_ETHER)
 		dev = alloc_netdev(0, "pvceth%d", ether_setup);
-		dev->priv_flags &= ~IFF_TX_SKB_SHARING;
-	} else
+	else
 		dev = alloc_netdev(0, "pvc%d", pvc_setup);
 
 	if (!dev) {
diff --git a/trunk/drivers/net/wireless/airo.c b/trunk/drivers/net/wireless/airo.c
index e1b3e3c134fd..55cf71fbffe3 100644
--- a/trunk/drivers/net/wireless/airo.c
+++ b/trunk/drivers/net/wireless/airo.c
@@ -2823,7 +2823,6 @@ static struct net_device *_init_airo_card( unsigned short irq, int port,
 	dev->wireless_data = &ai->wireless_data;
 	dev->irq = irq;
 	dev->base_addr = port;
-	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
 
 	SET_NETDEV_DEV(dev, dmdev);
diff --git a/trunk/drivers/net/wireless/b43/Kconfig b/trunk/drivers/net/wireless/b43/Kconfig
index 3cab843afb05..d2293dcc117f 100644
--- a/trunk/drivers/net/wireless/b43/Kconfig
+++ b/trunk/drivers/net/wireless/b43/Kconfig
@@ -28,7 +28,7 @@ config B43
 
 config B43_BCMA
 	bool "Support for BCMA bus"
-	depends on B43 && BCMA
+	depends on B43 && BCMA && BROKEN
 	default y
 
 config B43_SSB
diff --git a/trunk/drivers/net/wireless/b43/bus.c b/trunk/drivers/net/wireless/b43/bus.c
index 05f6c7bff6ab..64c3f65ff8c0 100644
--- a/trunk/drivers/net/wireless/b43/bus.c
+++ b/trunk/drivers/net/wireless/b43/bus.c
@@ -244,12 +244,10 @@ void b43_bus_set_wldev(struct b43_bus_dev *dev, void *wldev)
 #ifdef CONFIG_B43_BCMA
 	case B43_BUS_BCMA:
 		bcma_set_drvdata(dev->bdev, wldev);
-		break;
 #endif
 #ifdef CONFIG_B43_SSB
 	case B43_BUS_SSB:
 		ssb_set_drvdata(dev->sdev, wldev);
-		break;
 #endif
 	}
 }
diff --git a/trunk/drivers/net/wireless/b43/main.c b/trunk/drivers/net/wireless/b43/main.c
index 26f1ab840cc7..032d46674f6b 100644
--- a/trunk/drivers/net/wireless/b43/main.c
+++ b/trunk/drivers/net/wireless/b43/main.c
@@ -5350,7 +5350,6 @@ static void b43_ssb_remove(struct ssb_device *sdev)
 {
 	struct b43_wl *wl = ssb_get_devtypedata(sdev);
 	struct b43_wldev *wldev = ssb_get_drvdata(sdev);
-	struct b43_bus_dev *dev = wldev->dev;
 
 	/* We must cancel any work here before unregistering from ieee80211,
 	 * as the ieee80211 unreg will destroy the workqueue. */
@@ -5366,14 +5365,14 @@ static void b43_ssb_remove(struct ssb_device *sdev)
 		ieee80211_unregister_hw(wl->hw);
 	}
 
-	b43_one_core_detach(dev);
+	b43_one_core_detach(wldev->dev);
 
 	if (list_empty(&wl->devlist)) {
 		b43_leds_unregister(wl);
 		/* Last core on the chip unregistered.
 		 * We can destroy common struct b43_wl.
 		 */
-		b43_wireless_exit(dev, wl);
+		b43_wireless_exit(wldev->dev, wl);
 	}
 }
diff --git a/trunk/drivers/net/wireless/hostap/hostap_main.c b/trunk/drivers/net/wireless/hostap/hostap_main.c
index 89a116fba1de..d5084829c9e5 100644
--- a/trunk/drivers/net/wireless/hostap/hostap_main.c
+++ b/trunk/drivers/net/wireless/hostap/hostap_main.c
@@ -855,7 +855,6 @@ void hostap_setup_dev(struct net_device *dev, local_info_t *local,
 	iface = netdev_priv(dev);
 	ether_setup(dev);
-	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
 
 	/* kernel callbacks */
 	if (iface) {
diff --git a/trunk/drivers/nfc/pn533.c b/trunk/drivers/nfc/pn533.c
index c77e0543e502..037231540719 100644
--- a/trunk/drivers/nfc/pn533.c
+++ b/trunk/drivers/nfc/pn533.c
@@ -1596,7 +1596,7 @@ static void pn533_disconnect(struct usb_interface *interface)
 	usb_free_urb(dev->out_urb);
 	kfree(dev);
 
-	nfc_dev_info(&interface->dev, "NXP PN533 NFC device disconnected");
+	nfc_dev_info(&dev->interface->dev, "NXP PN533 NFC device disconnected");
 }
 
 static struct usb_driver pn533_driver = {
diff --git a/trunk/drivers/staging/ath6kl/os/linux/ar6000_drv.c b/trunk/drivers/staging/ath6kl/os/linux/ar6000_drv.c
index 32ee39ad00df..499b7a90e941 100644
--- a/trunk/drivers/staging/ath6kl/os/linux/ar6000_drv.c
+++ b/trunk/drivers/staging/ath6kl/os/linux/ar6000_drv.c
@@ -6205,7 +6205,6 @@ int ar6000_create_ap_interface(struct ar6_softc *ar, char *ap_ifname)
 
 	ether_setup(dev);
 	init_netdev(dev, ap_ifname);
-	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
 
 	if (register_netdev(dev)) {
 		AR_DEBUG_PRINTF(ATH_DEBUG_ERR,("ar6000_create_ap_interface: register_netdev failed\n"));
diff --git a/trunk/include/linux/if.h b/trunk/include/linux/if.h
index 03489ca92ded..3bc63e6a02f7 100644
--- a/trunk/include/linux/if.h
+++ b/trunk/include/linux/if.h
@@ -76,8 +76,6 @@
 #define IFF_BRIDGE_PORT	0x4000		/* device used as bridge port */
 #define IFF_OVS_DATAPATH	0x8000	/* device used as Open vSwitch
 					 * datapath port */
-#define IFF_TX_SKB_SHARING	0x10000	/* The interface supports sharing
-					 * skbs on transmit */
 
 #define IF_GET_IFACE	0x0001		/* for querying only */
 #define IF_GET_PROTO	0x0002
diff --git a/trunk/include/linux/netdevice.h b/trunk/include/linux/netdevice.h
index ddee79bb8f15..2ed0b6cf11c5 100644
--- a/trunk/include/linux/netdevice.h
+++ b/trunk/include/linux/netdevice.h
@@ -1132,7 +1132,7 @@ struct net_device {
 	spinlock_t		addr_list_lock;
 	struct netdev_hw_addr_list	uc;	/* Unicast mac addresses */
 	struct netdev_hw_addr_list	mc;	/* Multicast mac addresses */
-	bool			uc_promisc;
+	int			uc_promisc;
 	unsigned int		promiscuity;
 	unsigned int		allmulti;
@@ -1679,12 +1679,9 @@ static inline int skb_gro_header_hard(struct sk_buff *skb, unsigned int hlen)
 static inline void *skb_gro_header_slow(struct sk_buff *skb, unsigned int hlen,
 					unsigned int offset)
 {
-	if (!pskb_may_pull(skb, hlen))
-		return NULL;
-
 	NAPI_GRO_CB(skb)->frag0 = NULL;
 	NAPI_GRO_CB(skb)->frag0_len = 0;
-	return skb->data + offset;
+	return pskb_may_pull(skb, hlen) ? skb->data + offset : NULL;
 }
 
 static inline void *skb_gro_mac_header(struct sk_buff *skb)
diff --git a/trunk/include/linux/raid/md_p.h b/trunk/include/linux/raid/md_p.h
index 9e65d9e20662..75cbf4f62fe8 100644
--- a/trunk/include/linux/raid/md_p.h
+++ b/trunk/include/linux/raid/md_p.h
@@ -245,16 +245,10 @@ struct mdp_superblock_1 {
 	__u8	device_uuid[16]; /* user-space setable, ignored by kernel */
 	__u8	devflags;	/* per-device flags.  Only one defined...*/
 #define	WriteMostly1	1	/* mask for writemostly flag in above */
-	/* Bad block log.  If there are any bad blocks the feature flag is set.
-	 * If offset and size are non-zero, that space is reserved and available
-	 */
-	__u8	bblog_shift;	/* shift from sectors to block size */
-	__le16	bblog_size;	/* number of sectors reserved for list */
-	__le32	bblog_offset;	/* sector offset from superblock to bblog,
-				 * signed - not unsigned */
+	__u8	pad2[64-57];	/* set to 0 when writing */
 
 	/* array state information - 64 bytes */
-	__le64	utime;		/* 40 bits second, 24 bits microseconds */
+	__le64	utime;		/* 40 bits second, 24 btes microseconds */
 	__le64	events;		/* incremented when superblock updated */
 	__le64	resync_offset;	/* data before this offset (from data_offset) known to be in sync */
 	__le32	sb_csum;	/* checksum up to devs[max_dev] */
@@ -276,8 +270,8 @@ struct mdp_superblock_1 {
 					 * must be honoured
 					 */
 #define	MD_FEATURE_RESHAPE_ACTIVE	4
-#define	MD_FEATURE_BAD_BLOCKS		8 /* badblock list is not empty */
-#define	MD_FEATURE_ALL			(1|2|4|8)
+#define	MD_FEATURE_ALL			(1|2|4)
 
 #endif
+
diff --git a/trunk/net/8021q/vlan_dev.c b/trunk/net/8021q/vlan_dev.c
index 9d40a071d038..934e221c1d07 100644
--- a/trunk/net/8021q/vlan_dev.c
+++ b/trunk/net/8021q/vlan_dev.c
@@ -695,7 +695,7 @@ void vlan_setup(struct net_device *dev)
 	ether_setup(dev);
 
 	dev->priv_flags		|= IFF_802_1Q_VLAN;
-	dev->priv_flags		&= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING);
+	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;
 	dev->tx_queue_len	= 0;
 
 	dev->netdev_ops		= &vlan_netdev_ops;
diff --git a/trunk/net/bluetooth/bnep/netdev.c b/trunk/net/bluetooth/bnep/netdev.c
index d4f5dff7c955..8c100c9dae28 100644
--- a/trunk/net/bluetooth/bnep/netdev.c
+++ b/trunk/net/bluetooth/bnep/netdev.c
@@ -231,7 +231,6 @@ void bnep_net_setup(struct net_device *dev)
 	dev->addr_len = ETH_ALEN;
 
 	ether_setup(dev);
-	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
 	dev->netdev_ops = &bnep_netdev_ops;
 
 	dev->watchdog_timeo = HZ * 2;
diff --git a/trunk/net/core/dev.c b/trunk/net/core/dev.c
index 17d67b579beb..9444c5cb4137 100644
--- a/trunk/net/core/dev.c
+++ b/trunk/net/core/dev.c
@@ -4497,10 +4497,10 @@ void __dev_set_rx_mode(struct net_device *dev)
 		 */
 		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
 			__dev_set_promiscuity(dev, 1);
-			dev->uc_promisc = true;
+			dev->uc_promisc = 1;
 		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
 			__dev_set_promiscuity(dev, -1);
-			dev->uc_promisc = false;
+			dev->uc_promisc = 0;
 		}
 
 		if (ops->ndo_set_multicast_list)
diff --git a/trunk/net/core/pktgen.c b/trunk/net/core/pktgen.c
index e35a6fbb8110..f76079cd750c 100644
--- a/trunk/net/core/pktgen.c
+++ b/trunk/net/core/pktgen.c
@@ -1070,9 +1070,7 @@ static ssize_t pktgen_if_write(struct file *file,
 		len = num_arg(&user_buffer[i], 10, &value);
 		if (len < 0)
 			return len;
-		if ((value > 0) &&
-		    (!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING)))
-			return -ENOTSUPP;
+
 		i += len;
 		pkt_dev->clone_skb = value;
@@ -3557,6 +3555,7 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)
 	pkt_dev->min_pkt_size = ETH_ZLEN;
 	pkt_dev->max_pkt_size = ETH_ZLEN;
 	pkt_dev->nfrags = 0;
+	pkt_dev->clone_skb = pg_clone_skb_d;
 	pkt_dev->delay = pg_delay_d;
 	pkt_dev->count = pg_count_d;
 	pkt_dev->sofar = 0;
@@ -3564,6 +3563,7 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)
 	pkt_dev->udp_src_max = 9;
 	pkt_dev->udp_dst_min = 9;
 	pkt_dev->udp_dst_max = 9;
+	pkt_dev->vlan_p = 0;
 	pkt_dev->vlan_cfi = 0;
 	pkt_dev->vlan_id = 0xffff;
@@ -3575,8 +3575,6 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)
 	err = pktgen_setup_dev(pkt_dev, ifname);
 	if (err)
 		goto out1;
-	if (pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING)
-		pkt_dev->clone_skb = pg_clone_skb_d;
 
 	pkt_dev->entry = proc_create_data(ifname, 0600, pg_proc_dir,
 					  &pktgen_if_fops, pkt_dev);
diff --git a/trunk/net/ethernet/eth.c b/trunk/net/ethernet/eth.c
index 27997d35ebd3..5cffb63f481a 100644
--- a/trunk/net/ethernet/eth.c
+++ b/trunk/net/ethernet/eth.c
@@ -231,7 +231,6 @@ EXPORT_SYMBOL(eth_header_parse);
  * eth_header_cache - fill cache entry from neighbour
  * @neigh: source neighbour
  * @hh: destination cache entry
- * @type: Ethernet type field
  * Create an Ethernet header template from the neighbour.
  */
 int eth_header_cache(const struct neighbour *neigh, struct hh_cache *hh, __be16 type)
@@ -340,7 +339,6 @@ void ether_setup(struct net_device *dev)
 	dev->addr_len		= ETH_ALEN;
 	dev->tx_queue_len	= 1000;	/* Ethernet wants good queues */
 	dev->flags		= IFF_BROADCAST|IFF_MULTICAST;
-	dev->priv_flags		= IFF_TX_SKB_SHARING;
 
 	memset(dev->broadcast, 0xFF, ETH_ALEN);
diff --git a/trunk/net/ipv4/devinet.c b/trunk/net/ipv4/devinet.c
index bc19bd06dd00..37b3c188d8b3 100644
--- a/trunk/net/ipv4/devinet.c
+++ b/trunk/net/ipv4/devinet.c
@@ -1134,15 +1134,15 @@ static void inetdev_send_gratuitous_arp(struct net_device *dev,
 					struct in_device *in_dev)
 
 {
-	struct in_ifaddr *ifa;
+	struct in_ifaddr *ifa = in_dev->ifa_list;
 
-	for (ifa = in_dev->ifa_list; ifa;
-	     ifa = ifa->ifa_next) {
-		arp_send(ARPOP_REQUEST, ETH_P_ARP,
-			 ifa->ifa_local, dev,
-			 ifa->ifa_local, NULL,
-			 dev->dev_addr, NULL);
-	}
+	if (!ifa)
+		return;
+
+	arp_send(ARPOP_REQUEST, ETH_P_ARP,
+		 ifa->ifa_local, dev,
+		 ifa->ifa_local, NULL,
+		 dev->dev_addr, NULL);
 }
 
 /* Called only under RTNL semaphore */
diff --git a/trunk/net/ipv6/addrconf.c b/trunk/net/ipv6/addrconf.c
index a55500cc0b29..a06c53c14d84 100644
--- a/trunk/net/ipv6/addrconf.c
+++ b/trunk/net/ipv6/addrconf.c
@@ -1481,8 +1481,6 @@ static void addrconf_join_anycast(struct inet6_ifaddr *ifp)
 static void addrconf_leave_anycast(struct inet6_ifaddr *ifp)
 {
 	struct in6_addr addr;
-	if (ifp->prefix_len == 127) /* RFC 6164 */
-		return;
 	ipv6_addr_prefix(&addr, &ifp->addr, ifp->prefix_len);
 	if (ipv6_addr_any(&addr))
 		return;
diff --git a/trunk/net/l2tp/l2tp_eth.c b/trunk/net/l2tp/l2tp_eth.c
index d2726a74597d..a8193f52c13c 100644
--- a/trunk/net/l2tp/l2tp_eth.c
+++ b/trunk/net/l2tp/l2tp_eth.c
@@ -103,7 +103,7 @@ static struct net_device_ops l2tp_eth_netdev_ops = {
 
 static void l2tp_eth_dev_setup(struct net_device *dev)
 {
 	ether_setup(dev);
-	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
+
 	dev->netdev_ops		= &l2tp_eth_netdev_ops;
 	dev->destructor		= free_netdev;
 }
diff --git a/trunk/net/mac80211/iface.c b/trunk/net/mac80211/iface.c
index 556e7e6ddf0a..cd5fb40d3fd4 100644
--- a/trunk/net/mac80211/iface.c
+++ b/trunk/net/mac80211/iface.c
@@ -698,7 +698,6 @@ static const struct net_device_ops ieee80211_monitorif_ops = {
 static void ieee80211_if_setup(struct net_device *dev)
 {
 	ether_setup(dev);
-	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
 	dev->netdev_ops = &ieee80211_dataif_ops;
 	dev->destructor = free_netdev;
 }
diff --git a/trunk/net/socket.c b/trunk/net/socket.c
index b1cbbcd92558..26ed35c7751e 100644
--- a/trunk/net/socket.c
+++ b/trunk/net/socket.c
@@ -580,7 +580,7 @@ int sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
 }
 EXPORT_SYMBOL(sock_sendmsg);
 
-static int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg, size_t size)
+int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg, size_t size)
 {
 	struct kiocb iocb;
 	struct sock_iocb siocb;
diff --git a/trunk/net/wireless/reg.c b/trunk/net/wireless/reg.c
index 02751dbc5a97..1ad0f39fe091 100644
--- a/trunk/net/wireless/reg.c
+++ b/trunk/net/wireless/reg.c
@@ -903,7 +903,7 @@ static bool ignore_reg_update(struct wiphy *wiphy,
 	    initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE &&
 	    !is_world_regdom(last_request->alpha2)) {
 		REG_DBG_PRINT("Ignoring regulatory request %s "
-			      "since the driver requires its own regulatory "
+			      "since the driver requires its own regulaotry "
 			      "domain to be set first",
 			      reg_initiator_name(initiator));
 		return true;
@@ -1125,13 +1125,12 @@ void wiphy_update_regulatory(struct wiphy *wiphy,
 	enum ieee80211_band band;
 
 	if (ignore_reg_update(wiphy, initiator))
-		return;
-
+		goto out;
 	for (band = 0; band < IEEE80211_NUM_BANDS; band++) {
 		if (wiphy->bands[band])
 			handle_band(wiphy, band, initiator);
 	}
-
+out:
 	reg_process_beacons(wiphy);
 	reg_process_ht_flags(wiphy);
 	if (wiphy->reg_notifier)
diff --git a/trunk/sound/oss/ad1848.c b/trunk/sound/oss/ad1848.c
index 8a197fd3c57e..4d2a6ae978f7 100644
--- a/trunk/sound/oss/ad1848.c
+++ b/trunk/sound/oss/ad1848.c
@@ -458,7 +458,7 @@ static int ad1848_set_recmask(ad1848_info * devc, int mask)
 	return mask;
 }
 
-static void oss_change_bits(ad1848_info *devc, unsigned char *regval,
+static void change_bits(ad1848_info * devc, unsigned char *regval,
 			unsigned char *muteval, int dev, int chn, int newval)
 {
 	unsigned char mask;
@@ -516,10 +516,10 @@ static void ad1848_mixer_set_channel(ad1848_info *devc, int dev, int value, int
 
 	if (muteregoffs != regoffs) {
 		muteval = ad_read(devc, muteregoffs);
-		oss_change_bits(devc, &val, &muteval, dev, channel, value);
+		change_bits(devc, &val, &muteval, dev, channel, value);
 	}
 	else
-		oss_change_bits(devc, &val, &val, dev, channel, value);
+		change_bits(devc, &val, &val, dev, channel, value);
 
 	spin_lock_irqsave(&devc->lock,flags);
 	ad_write(devc, regoffs, val);
diff --git a/trunk/sound/oss/sb_mixer.c b/trunk/sound/oss/sb_mixer.c
index f8f3b7a66b73..2039d31b7e22 100644
--- a/trunk/sound/oss/sb_mixer.c
+++ b/trunk/sound/oss/sb_mixer.c
@@ -232,7 +232,7 @@ static int detect_mixer(sb_devc * devc)
 	return 1;
 }
 
-static void oss_change_bits(sb_devc *devc, unsigned char *regval, int dev, int chn, int newval)
+static void change_bits(sb_devc * devc, unsigned char *regval, int dev, int chn, int newval)
 {
 	unsigned char mask;
 	int shift;
@@ -284,7 +284,7 @@ int sb_common_mixer_set(sb_devc * devc, int dev, int left, int right)
 		return -EINVAL;
 
 	val = sb_getmixer(devc, regoffs);
-	oss_change_bits(devc, &val, dev, LEFT_CHN, left);
+	change_bits(devc, &val, dev, LEFT_CHN, left);
 
 	if ((*devc->iomap)[dev][RIGHT_CHN].regno != regoffs)	/*
 								 * Change register
@@ -304,7 +304,7 @@ int sb_common_mixer_set(sb_devc * devc, int dev, int left, int right)
 		 * Read the new one
 		 */
 	}
-	oss_change_bits(devc, &val, dev, RIGHT_CHN, right);
+	change_bits(devc, &val, dev, RIGHT_CHN, right);
 
 	sb_setmixer(devc, regoffs, val);
diff --git a/trunk/sound/pci/asihpi/hpioctl.c b/trunk/sound/pci/asihpi/hpioctl.c
index 9683f84ecdc8..65fcf4770731 100644
--- a/trunk/sound/pci/asihpi/hpioctl.c
+++ b/trunk/sound/pci/asihpi/hpioctl.c
@@ -107,6 +107,7 @@ long asihpi_hpi_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	union hpi_response_buffer_v1 *hr;
 	u16 res_max_size;
 	u32 uncopied_bytes;
+	struct hpi_adapter *pa = NULL;
 	int err = 0;
 
 	if (cmd != HPI_IOCTL_LINUX)
@@ -181,9 +182,8 @@ long asihpi_hpi_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 		/* -1=no data 0=read from user mem, 1=write to user mem */
 		int wrflag = -1;
 		u32 adapter = hm->h.adapter_index;
-		struct hpi_adapter *pa = &adapters[adapter];
 
-		if ((adapter >= HPI_MAX_ADAPTERS) || (!pa->type)) {
+		if ((adapter > HPI_MAX_ADAPTERS) || (!pa->type)) {
 			hpi_init_response(&hr->r0, HPI_OBJ_ADAPTER,
 				HPI_ADAPTER_OPEN,
 				HPI_ERROR_BAD_ADAPTER_NUMBER);
@@ -197,7 +197,9 @@ long asihpi_hpi_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 			goto out;
 		}
 
-		if (mutex_lock_interruptible(&pa->mutex)) {
+		pa = &adapters[adapter];
+
+		if (mutex_lock_interruptible(&adapters[adapter].mutex)) {
 			err = -EINTR;
 			goto out;
 		}
@@ -233,7 +235,8 @@ long asihpi_hpi_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 					"stream buffer size %d\n",
 					size);
 
-				mutex_unlock(&pa->mutex);
+				mutex_unlock(&adapters
+					[adapter].mutex);
 				err = -EINVAL;
 				goto out;
 			}
@@ -274,7 +277,7 @@ long asihpi_hpi_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 				uncopied_bytes, size);
 		}
 
-		mutex_unlock(&pa->mutex);
+		mutex_unlock(&adapters[adapter].mutex);
 	}
 
 	/* on return response size must be set */
diff --git a/trunk/sound/pci/hda/patch_realtek.c b/trunk/sound/pci/hda/patch_realtek.c
index e125c60fe352..694327ae8b71 100644
--- a/trunk/sound/pci/hda/patch_realtek.c
+++ b/trunk/sound/pci/hda/patch_realtek.c
@@ -895,15 +895,13 @@ static void alc_init_auto_hp(struct hda_codec *codec)
 	if (present == 3)
 		spec->automute_hp_lo = 1; /* both HP and LO automute */
 
-	if (!cfg->speaker_pins[0] &&
-	    cfg->line_out_type == AUTO_PIN_SPEAKER_OUT) {
+	if (!cfg->speaker_pins[0]) {
 		memcpy(cfg->speaker_pins, cfg->line_out_pins,
 		       sizeof(cfg->speaker_pins));
 		cfg->speaker_outs = cfg->line_outs;
 	}
 
-	if (!cfg->hp_pins[0] &&
-	    cfg->line_out_type == AUTO_PIN_HP_OUT) {
+	if (!cfg->hp_pins[0]) {
 		memcpy(cfg->hp_pins, cfg->line_out_pins,
 		       sizeof(cfg->hp_pins));
 		cfg->hp_outs = cfg->line_outs;
@@ -922,7 +920,6 @@ static void alc_init_auto_hp(struct hda_codec *codec)
 		spec->automute_mode = ALC_AUTOMUTE_PIN;
 	}
 	if (spec->automute && cfg->line_out_pins[0] &&
-	    cfg->speaker_pins[0] &&
 	    cfg->line_out_pins[0] != cfg->hp_pins[0] &&
 	    cfg->line_out_pins[0] != cfg->speaker_pins[0]) {
 		for (i = 0; i < cfg->line_outs; i++) {
@@ -1914,7 +1911,7 @@ static int alc_build_controls(struct hda_codec *codec)
 			return err;
 		}
 	}
-	if (spec->cap_mixer && spec->adc_nids) {
+	if (spec->cap_mixer) {
 		const char *kname = kctl ? kctl->id.name : NULL;
 		for (knew = spec->cap_mixer; knew->name; knew++) {
 			if (kname && strcmp(knew->name, kname) == 0)
@@ -3680,7 +3677,7 @@ static int patch_alc880(struct hda_codec *codec)
 	if (board_config != ALC_MODEL_AUTO)
 		setup_preset(codec, &alc880_presets[board_config]);
 
-	if (!spec->no_analog && !spec->adc_nids) {
+	if (!spec->no_analog && !spec->adc_nids && spec->input_mux) {
 		alc_auto_fill_adc_caps(codec);
 		alc_rebuild_imux_for_auto_mic(codec);
 		alc_remove_invalid_adc_nids(codec);
@@ -3807,7 +3804,7 @@ static int patch_alc260(struct hda_codec *codec)
 	if (board_config != ALC_MODEL_AUTO)
 		setup_preset(codec, &alc260_presets[board_config]);
 
-	if (!spec->no_analog && !spec->adc_nids) {
+	if (!spec->no_analog && !spec->adc_nids && spec->input_mux) {
 		alc_auto_fill_adc_caps(codec);
 		alc_rebuild_imux_for_auto_mic(codec);
 		alc_remove_invalid_adc_nids(codec);
@@ -3986,7 +3983,7 @@ static int patch_alc882(struct hda_codec *codec)
 	if (board_config != ALC_MODEL_AUTO)
 		setup_preset(codec, &alc882_presets[board_config]);
 
-	if (!spec->no_analog && !spec->adc_nids) {
+	if (!spec->no_analog && !spec->adc_nids && spec->input_mux) {
 		alc_auto_fill_adc_caps(codec);
 		alc_rebuild_imux_for_auto_mic(codec);
 		alc_remove_invalid_adc_nids(codec);
@@ -4140,7 +4137,7 @@ static int patch_alc262(struct hda_codec *codec)
 	if (board_config != ALC_MODEL_AUTO)
 		setup_preset(codec, &alc262_presets[board_config]);
 
-	if (!spec->no_analog && !spec->adc_nids) {
+	if (!spec->no_analog && !spec->adc_nids && spec->input_mux) {
 		alc_auto_fill_adc_caps(codec);
 		alc_rebuild_imux_for_auto_mic(codec);
 		alc_remove_invalid_adc_nids(codec);
@@ -4296,7 +4293,7 @@ static int patch_alc268(struct hda_codec *codec)
 					  (0 << AC_AMPCAP_MUTE_SHIFT));
 	}
 
-	if (!spec->no_analog && !spec->adc_nids) {
+	if (!spec->no_analog && !spec->adc_nids && spec->input_mux) {
 		alc_auto_fill_adc_caps(codec);
 		alc_rebuild_imux_for_auto_mic(codec);
 		alc_remove_invalid_adc_nids(codec);
@@ -4708,7 +4705,7 @@ static int patch_alc269(struct hda_codec *codec)
 	if (board_config != ALC_MODEL_AUTO)
 		setup_preset(codec, &alc269_presets[board_config]);
 
-	if (!spec->no_analog && !spec->adc_nids) {
+	if (!spec->no_analog && !spec->adc_nids && spec->input_mux) {
 		alc_auto_fill_adc_caps(codec);
 		alc_rebuild_imux_for_auto_mic(codec);
 		alc_remove_invalid_adc_nids(codec);
@@ -4846,7 +4843,7 @@ static int patch_alc861(struct hda_codec *codec)
 	if (board_config != ALC_MODEL_AUTO)
 		setup_preset(codec, &alc861_presets[board_config]);
 
-	if (!spec->no_analog && !spec->adc_nids) {
+	if (!spec->no_analog && !spec->adc_nids && spec->input_mux) {
 		alc_auto_fill_adc_caps(codec);
 		alc_rebuild_imux_for_auto_mic(codec);
 		alc_remove_invalid_adc_nids(codec);
@@ -4987,7 +4984,7 @@ static int patch_alc861vd(struct hda_codec *codec)
 		add_verb(spec, alc660vd_eapd_verbs);
 	}
 
-	if (!spec->no_analog && !spec->adc_nids) {
+	if (!spec->no_analog && !spec->adc_nids && spec->input_mux) {
 		alc_auto_fill_adc_caps(codec);
 		alc_rebuild_imux_for_auto_mic(codec);
 		alc_remove_invalid_adc_nids(codec);
@@ -5203,7 +5200,7 @@ static int patch_alc662(struct hda_codec *codec)
 	if (board_config != ALC_MODEL_AUTO)
 		setup_preset(codec, &alc662_presets[board_config]);
 
-	if (!spec->no_analog && !spec->adc_nids) {
+	if (!spec->no_analog && !spec->adc_nids && spec->input_mux) {
 		alc_auto_fill_adc_caps(codec);
 		alc_rebuild_imux_for_auto_mic(codec);
 		alc_remove_invalid_adc_nids(codec);
@@ -5339,7 +5336,7 @@ static int patch_alc680(struct hda_codec *codec)
 #endif
 	}
 
-	if (!spec->no_analog && !spec->adc_nids) {
+	if (!spec->no_analog && !spec->adc_nids && spec->input_mux) {
 		alc_auto_fill_adc_caps(codec);
 		alc_rebuild_imux_for_auto_mic(codec);
 		alc_remove_invalid_adc_nids(codec);
diff --git a/trunk/sound/pci/hda/patch_sigmatel.c b/trunk/sound/pci/hda/patch_sigmatel.c
index aa376b59c006..fcf4c7142103 100644
--- a/trunk/sound/pci/hda/patch_sigmatel.c
+++ b/trunk/sound/pci/hda/patch_sigmatel.c
@@ -213,7 +213,6 @@ struct sigmatel_spec {
 	unsigned int gpio_mute;
 	unsigned int gpio_led;
 	unsigned int gpio_led_polarity;
-	unsigned int vref_led;
 
 	/* stream */
 	unsigned int stream_delay;
@@ -673,30 +672,6 @@ static int stac92xx_smux_enum_put(struct snd_kcontrol *kcontrol,
 	return 0;
 }
 
-static int stac_vrefout_set(struct hda_codec *codec,
-					hda_nid_t nid, unsigned int new_vref)
-{
-	int error, pinctl;
-
-	snd_printdd("%s, nid %x ctl %x\n", __func__, nid, new_vref);
-	pinctl = snd_hda_codec_read(codec, nid, 0,
-			AC_VERB_GET_PIN_WIDGET_CONTROL, 0);
-
-	if (pinctl < 0)
-		return pinctl;
-
-	pinctl &= 0xff;
-	pinctl &= ~AC_PINCTL_VREFEN;
-	pinctl |= (new_vref & AC_PINCTL_VREFEN);
-
-	error = snd_hda_codec_write_cache(codec, nid, 0,
-			AC_VERB_SET_PIN_WIDGET_CONTROL, pinctl);
-	if (error < 0)
-		return error;
-
-	return 1;
-}
-
 static unsigned int stac92xx_vref_set(struct hda_codec *codec,
 					hda_nid_t nid, unsigned int new_vref)
 {
@@ -4094,8 +4069,6 @@ static void stac_gpio_set(struct hda_codec *codec, unsigned int mask,
 {
 	unsigned int gpiostate, gpiomask, gpiodir;
 
-	snd_printdd("%s msk %x dir %x gpio %x\n", __func__, mask, dir_mask, data);
-
 	gpiostate = snd_hda_codec_read(codec, codec->afg, 0,
 				       AC_VERB_GET_GPIO_DATA, 0);
 	gpiostate = (gpiostate & ~dir_mask) | (data & dir_mask);
@@ -4285,12 +4258,10 @@ static void stac_store_hints(struct hda_codec *codec)
 		spec->eapd_switch = val;
 	get_int_hint(codec, "gpio_led_polarity", &spec->gpio_led_polarity);
 	if (get_int_hint(codec, "gpio_led", &spec->gpio_led)) {
-		if (spec->gpio_led <= 8) {
-			spec->gpio_mask |= spec->gpio_led;
-			spec->gpio_dir |= spec->gpio_led;
-			if (spec->gpio_led_polarity)
-				spec->gpio_data |= spec->gpio_led;
-		}
+		spec->gpio_mask |= spec->gpio_led;
+		spec->gpio_dir |= spec->gpio_led;
+		if (spec->gpio_led_polarity)
+			spec->gpio_data |= spec->gpio_led;
 	}
 }
 
@@ -4460,26 +4431,11 @@ static void stac92xx_free_kctls(struct hda_codec *codec)
 	snd_array_free(&spec->kctls);
 }
 
-static void stac92xx_shutup_pins(struct hda_codec *codec)
-{
-	unsigned int i, def_conf;
-
-	if (codec->bus->shutdown)
-		return;
-	for (i = 0; i < codec->init_pins.used; i++) {
-		struct hda_pincfg *pin = snd_array_elem(&codec->init_pins, i);
-		def_conf = snd_hda_codec_get_pincfg(codec, pin->nid);
-		if (get_defcfg_connect(def_conf) != AC_JACK_PORT_NONE)
-			snd_hda_codec_write(codec, pin->nid, 0,
-				    AC_VERB_SET_PIN_WIDGET_CONTROL, 0);
-	}
-}
-
 static void stac92xx_shutup(struct hda_codec *codec)
{
 	struct sigmatel_spec *spec = codec->spec;
 
-	stac92xx_shutup_pins(codec);
+	snd_hda_shutup_pins(codec);
 
 	if (spec->eapd_mask)
 		stac_gpio_set(codec, spec->gpio_mask,
@@ -4877,11 +4833,10 @@ static int find_mute_led_gpio(struct hda_codec *codec, int default_polarity)
 	if ((codec->subsystem_id >> 16) == PCI_VENDOR_ID_HP) {
 		while ((dev = dmi_find_device(DMI_DEV_TYPE_OEM_STRING,
 								NULL, dev))) {
-			if (sscanf(dev->name, "HP_Mute_LED_%d_%x",
+			if (sscanf(dev->name, "HP_Mute_LED_%d_%d",
 				   &spec->gpio_led_polarity,
 				   &spec->gpio_led) == 2) {
-				if (spec->gpio_led < 4)
-					spec->gpio_led = 1 << spec->gpio_led;
+				spec->gpio_led = 1 << spec->gpio_led;
 				return 1;
 			}
 			if (sscanf(dev->name, "HP_Mute_LED_%d",
@@ -4980,6 +4935,17 @@ static void stac927x_proc_hook(struct snd_info_buffer *buffer,
 #endif
 
 #ifdef CONFIG_PM
+static int stac92xx_pre_resume(struct hda_codec *codec)
+{
+	struct sigmatel_spec *spec = codec->spec;
+
+	/* sync mute LED */
+	if (spec->gpio_led)
+		stac_gpio_set(codec, spec->gpio_mask,
+			      spec->gpio_dir, spec->gpio_data);
+	return 0;
+}
+
 static int stac92xx_resume(struct hda_codec *codec)
 {
 	struct sigmatel_spec *spec = codec->spec;
@@ -4998,65 +4964,7 @@ static int stac92xx_resume(struct hda_codec *codec)
 	return 0;
 }
 
-static int stac92xx_suspend(struct hda_codec *codec, pm_message_t state)
-{
-	stac92xx_shutup(codec);
-	return 0;
-}
-
 #ifdef CONFIG_SND_HDA_POWER_SAVE
-static int stac92xx_pre_resume(struct hda_codec *codec)
-{
-	struct sigmatel_spec *spec = codec->spec;
-
-	/* sync mute LED */
-	if (spec->gpio_led) {
-		if (spec->gpio_led <= 8) {
-			stac_gpio_set(codec, spec->gpio_mask,
-					spec->gpio_dir, spec->gpio_data);
-		} else {
-			stac_vrefout_set(codec,
-					spec->gpio_led, spec->vref_led);
-		}
-	}
-	return 0;
-}
-
-static int stac92xx_post_suspend(struct hda_codec *codec)
-{
-	struct sigmatel_spec *spec = codec->spec;
-	if (spec->gpio_led > 8) {
-		/* with vref-out pin used for mute led control
-		 * codec AFG is prevented from D3 state, but on
-		 * system suspend it can (and should) be used
-		 */
-		snd_hda_codec_read(codec, codec->afg, 0,
-				AC_VERB_SET_POWER_STATE, AC_PWRST_D3);
-	}
-	return 0;
-}
-
-static void stac92xx_set_power_state(struct hda_codec *codec, hda_nid_t fg,
-				unsigned int power_state)
-{
-	unsigned int afg_power_state = power_state;
-	struct sigmatel_spec *spec = codec->spec;
-
-	if (power_state == AC_PWRST_D3) {
-		if (spec->gpio_led > 8) {
-			/* with vref-out pin used for mute led control
-			 * codec AFG is prevented from D3 state
-			 */
-			afg_power_state = AC_PWRST_D1;
-		}
-		/* this delay seems necessary to avoid click noise at power-down */
-		msleep(100);
-	}
-	snd_hda_codec_read(codec, fg, 0, AC_VERB_SET_POWER_STATE,
-			afg_power_state);
-	snd_hda_codec_set_power_to_all(codec, fg, power_state, true);
-}
-
 /*
  * For this feature CONFIG_SND_HDA_POWER_SAVE is needed
  * as mute LED state is updated in check_power_status hook
 */
@@ -5065,12 +4973,8 @@ static int stac92xx_update_led_status(struct hda_codec *codec)
 {
 	struct sigmatel_spec *spec = codec->spec;
 	int i, num_ext_dacs, muted = 1;
-	unsigned int muted_lvl, notmtd_lvl;
 	hda_nid_t nid;
 
-	if (!spec->gpio_led)
-		return 0;
-
 	for (i = 0; i < spec->multiout.num_dacs; i++) {
 		nid = spec->multiout.dac_nids[i];
 		if (!(snd_hda_codec_amp_read(codec, nid, 0, HDA_OUTPUT, 0) &
@@ -5095,27 +4999,17 @@ static int stac92xx_update_led_status(struct hda_codec *codec)
 			muted = 0; /* extra output is not muted */
 		}
 	}
-	/*polarity defines *not* muted state level*/
-	if (spec->gpio_led <= 8) {
-		if (muted)
-			spec->gpio_data &= ~spec->gpio_led; /* orange */
-		else
-			spec->gpio_data |= spec->gpio_led; /* white */
+	if (muted)
+		spec->gpio_data &= ~spec->gpio_led; /* orange */
+	else
+		spec->gpio_data |= spec->gpio_led; /* white */
 
-		if (!spec->gpio_led_polarity) {
-			/* LED state is inverted on these systems */
-			spec->gpio_data ^= spec->gpio_led;
-		}
-		stac_gpio_set(codec, spec->gpio_mask,
-				spec->gpio_dir, spec->gpio_data);
-	} else {
-		notmtd_lvl = spec->gpio_led_polarity ?
-				AC_PINCTL_VREF_HIZ : AC_PINCTL_VREF_GRD;
-		muted_lvl = spec->gpio_led_polarity ?
-				AC_PINCTL_VREF_GRD : AC_PINCTL_VREF_HIZ;
-		spec->vref_led = muted ? muted_lvl : notmtd_lvl;
-		stac_vrefout_set(codec, spec->gpio_led, spec->vref_led);
+	if (!spec->gpio_led_polarity) {
+		/* LED state is inverted on these systems */
+		spec->gpio_data ^= spec->gpio_led;
 	}
+
+	stac_gpio_set(codec, spec->gpio_mask, spec->gpio_dir, spec->gpio_data);
 	return 0;
 }
 
@@ -5129,7 +5023,13 @@ static int stac92xx_check_power_status(struct hda_codec *codec,
 	return 0;
 }
 
-#endif /* CONFIG_SND_HDA_POWER_SAVE */
+#endif
+
+static int stac92xx_suspend(struct hda_codec *codec, pm_message_t state)
+{
+	stac92xx_shutup(codec);
+	return 0;
+}
 #endif /* CONFIG_PM */
 
 static const struct hda_codec_ops stac92xx_patch_ops = {
@@ -5141,6 +5041,7 @@ static const struct hda_codec_ops stac92xx_patch_ops = {
 #ifdef CONFIG_PM
 	.suspend = stac92xx_suspend,
 	.resume = stac92xx_resume,
+	.pre_resume = stac92xx_pre_resume,
 #endif
 	.reboot_notify = stac92xx_shutup,
 };
@@ -5654,17 +5555,10 @@ static int patch_stac92hd83xxx(struct hda_codec *codec)
 
 #ifdef CONFIG_SND_HDA_POWER_SAVE
 	if (spec->gpio_led) {
-		if (spec->gpio_led <= 8) {
-			spec->gpio_mask |= spec->gpio_led;
-			spec->gpio_dir |= spec->gpio_led;
-			spec->gpio_data |= spec->gpio_led;
-		} else {
-			codec->patch_ops.set_power_state =
-					stac92xx_set_power_state;
-			codec->patch_ops.post_suspend =
-					stac92xx_post_suspend;
-		}
-		codec->patch_ops.pre_resume = stac92xx_pre_resume;
+		spec->gpio_mask |= spec->gpio_led;
+		spec->gpio_dir |= spec->gpio_led;
+		spec->gpio_data |= spec->gpio_led;
+
 		/* register check_power_status callback. */
 		codec->patch_ops.check_power_status =
 			stac92xx_check_power_status;
 	}
@@ -5989,17 +5883,10 @@ static int patch_stac92hd71bxx(struct hda_codec *codec)
 
 #ifdef CONFIG_SND_HDA_POWER_SAVE
 	if (spec->gpio_led) {
-		if (spec->gpio_led <= 8) {
-			spec->gpio_mask |= spec->gpio_led;
-			spec->gpio_dir |= spec->gpio_led;
-			spec->gpio_data |= spec->gpio_led;
-		} else {
-			codec->patch_ops.set_power_state =
-					stac92xx_set_power_state;
-			codec->patch_ops.post_suspend =
-					stac92xx_post_suspend;
-		}
-		codec->patch_ops.pre_resume = stac92xx_pre_resume;
+		spec->gpio_mask |= spec->gpio_led;
+		spec->gpio_dir |= spec->gpio_led;
+		spec->gpio_data |= spec->gpio_led;
+
 		/* register check_power_status callback. */
 		codec->patch_ops.check_power_status =
 			stac92xx_check_power_status;
 	}