From d7cb6be0d0cdced2a7a96bd80a8f835e77ec5204 Mon Sep 17 00:00:00 2001
From: Song Liu
Date: Wed, 9 Dec 2020 11:43:08 -0800
Subject: [PATCH] Revert "md/raid10: improve raid10 discard request"

This reverts commit bcc90d280465ebd51ab8688be86e1f00c62dccf9.

Matthew Ruffell reported data corruption in raid10 due to the changes
in discard handling [1]. Revert these changes until we find a proper
fix.

[1] https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1907262/

Cc: Matthew Ruffell
Cc: Xiao Ni
Signed-off-by: Song Liu
---
 drivers/md/raid10.c | 256 +---------------------------------------------------
 1 file changed, 1 insertion(+), 255 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 05773ee1b5ba..4ad8447fa868 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1516,256 +1516,6 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
 	raid10_write_request(mddev, bio, r10_bio);
 }
 
-static struct bio *raid10_split_bio(struct r10conf *conf,
-			struct bio *bio, sector_t sectors, bool want_first)
-{
-	struct bio *split;
-
-	split = bio_split(bio, sectors, GFP_NOIO, &conf->bio_split);
-	bio_chain(split, bio);
-	allow_barrier(conf);
-	if (want_first) {
-		submit_bio_noacct(bio);
-		bio = split;
-	} else
-		submit_bio_noacct(split);
-	wait_barrier(conf);
-
-	return bio;
-}
-
-static void raid10_end_discard_request(struct bio *bio)
-{
-	struct r10bio *r10_bio = bio->bi_private;
-	struct r10conf *conf = r10_bio->mddev->private;
-	struct md_rdev *rdev = NULL;
-	int dev;
-	int slot, repl;
-
-	/*
-	 * We don't care about the return value of the discard bio.
-	 */
-	if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
-		set_bit(R10BIO_Uptodate, &r10_bio->state);
-
-	dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
-	if (repl)
-		rdev = conf->mirrors[dev].replacement;
-	if (!rdev) {
-		/* raid10_remove_disk uses smp_mb to make sure rdev is set to
-		 * replacement before setting replacement to NULL. It can read
-		 * rdev first without barrier protection even if replacement
-		 * is NULL.
-		 */
-		smp_rmb();
-		rdev = conf->mirrors[dev].rdev;
-	}
-
-	if (atomic_dec_and_test(&r10_bio->remaining)) {
-		md_write_end(r10_bio->mddev);
-		raid_end_bio_io(r10_bio);
-	}
-
-	rdev_dec_pending(rdev, conf->mddev);
-}
-
-/* There are some limitations when handling discard bios:
- * 1st, the discard size must be bigger than stripe_size*2.
- * 2nd, if the discard bio spans reshape progress, we use the old way to
- * handle the discard bio.
- */
-static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
-{
-	struct r10conf *conf = mddev->private;
-	struct geom *geo = &conf->geo;
-	struct r10bio *r10_bio;
-
-	int disk;
-	sector_t chunk;
-	unsigned int stripe_size;
-	sector_t split_size;
-
-	sector_t bio_start, bio_end;
-	sector_t first_stripe_index, last_stripe_index;
-	sector_t start_disk_offset;
-	unsigned int start_disk_index;
-	sector_t end_disk_offset;
-	unsigned int end_disk_index;
-	unsigned int remainder;
-
-	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
-		return -EAGAIN;
-
-	wait_barrier(conf);
-
-	/* Check reshape again to avoid a reshape starting after the
-	 * MD_RECOVERY_RESHAPE test above and before wait_barrier.
-	 */
-	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
-		goto out;
-
-	stripe_size = geo->raid_disks << geo->chunk_shift;
-	bio_start = bio->bi_iter.bi_sector;
-	bio_end = bio_end_sector(bio);
-
-	/* A discard bio may be smaller than one stripe size, or it may cross
-	 * a stripe boundary while the discard region is larger than one
-	 * stripe size. For far offset layout, if the discard region is not
-	 * aligned with the stripe size, there is a hole when we submit the
-	 * discard bio to a member disk. For simplicity, we only handle
-	 * discard bios whose discard region is bigger than stripe_size*2.
-	 */
-	if (bio_sectors(bio) < stripe_size*2)
-		goto out;
-
-	/* For far offset layout, if the bio is not aligned with the stripe
-	 * size, split off the part that is not aligned.
-	 */
-	div_u64_rem(bio_start, stripe_size, &remainder);
-	if (geo->far_offset && remainder) {
-		split_size = stripe_size - remainder;
-		bio = raid10_split_bio(conf, bio, split_size, false);
-	}
-	div_u64_rem(bio_end, stripe_size, &remainder);
-	if (geo->far_offset && remainder) {
-		split_size = bio_sectors(bio) - remainder;
-		bio = raid10_split_bio(conf, bio, split_size, true);
-	}
-
-	r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO);
-	r10_bio->mddev = mddev;
-	r10_bio->state = 0;
-	r10_bio->sectors = 0;
-	memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * geo->raid_disks);
-
-	wait_blocked_dev(mddev, r10_bio);
-
-	r10_bio->master_bio = bio;
-
-	bio_start = bio->bi_iter.bi_sector;
-	bio_end = bio_end_sector(bio);
-
-	/* raid10 uses chunks as the unit to store data, similar to raid0.
-	 * One stripe contains the chunks from all member disks (one chunk
-	 * from each disk at the same HBA address). For layout details, see
-	 * 'man 4 md'.
-	 */
-	chunk = bio_start >> geo->chunk_shift;
-	chunk *= geo->near_copies;
-	first_stripe_index = chunk;
-	start_disk_index = sector_div(first_stripe_index, geo->raid_disks);
-	if (geo->far_offset)
-		first_stripe_index *= geo->far_copies;
-	start_disk_offset = (bio_start & geo->chunk_mask) +
-		(first_stripe_index << geo->chunk_shift);
-
-	chunk = bio_end >> geo->chunk_shift;
-	chunk *= geo->near_copies;
-	last_stripe_index = chunk;
-	end_disk_index = sector_div(last_stripe_index, geo->raid_disks);
-	if (geo->far_offset)
-		last_stripe_index *= geo->far_copies;
-	end_disk_offset = (bio_end & geo->chunk_mask) +
-		(last_stripe_index << geo->chunk_shift);
-
-	rcu_read_lock();
-	for (disk = 0; disk < geo->raid_disks; disk++) {
-		struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev);
-		struct md_rdev *rrdev = rcu_dereference(
-			conf->mirrors[disk].replacement);
-
-		r10_bio->devs[disk].bio = NULL;
-		r10_bio->devs[disk].repl_bio = NULL;
-
-		if (rdev && (test_bit(Faulty, &rdev->flags)))
-			rdev = NULL;
-		if (rrdev && (test_bit(Faulty, &rrdev->flags)))
-			rrdev = NULL;
-		if (!rdev && !rrdev)
-			continue;
-
-		if (rdev) {
-			r10_bio->devs[disk].bio = bio;
-			atomic_inc(&rdev->nr_pending);
-		}
-		if (rrdev) {
-			r10_bio->devs[disk].repl_bio = bio;
-			atomic_inc(&rrdev->nr_pending);
-		}
-	}
-	rcu_read_unlock();
-
-	atomic_set(&r10_bio->remaining, 1);
-	for (disk = 0; disk < geo->raid_disks; disk++) {
-		sector_t dev_start, dev_end;
-		struct bio *mbio, *rbio = NULL;
-		struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev);
-		struct md_rdev *rrdev = rcu_dereference(
-			conf->mirrors[disk].replacement);
-
-		/*
-		 * Now calculate the start and end address for each disk.
-		 * The space between dev_start and dev_end is the discard
-		 * region.
-		 *
-		 * For dev_start, we need to consider three cases:
-		 * 1st, the disk is before start_disk: think of this disk as
-		 * being in the next stripe, so dev_start is the start address
-		 * of the next stripe.
-		 * 2nd, the disk is after start_disk: the disk is in the same
-		 * stripe as the first disk.
-		 * 3rd, the first disk itself: we can use start_disk_offset
-		 * directly.
-		 */
-		if (disk < start_disk_index)
-			dev_start = (first_stripe_index + 1) * mddev->chunk_sectors;
-		else if (disk > start_disk_index)
-			dev_start = first_stripe_index * mddev->chunk_sectors;
-		else
-			dev_start = start_disk_offset;
-
-		if (disk < end_disk_index)
-			dev_end = (last_stripe_index + 1) * mddev->chunk_sectors;
-		else if (disk > end_disk_index)
-			dev_end = last_stripe_index * mddev->chunk_sectors;
-		else
-			dev_end = end_disk_offset;
-
-		/* We only handle discard bios whose size is >= the stripe
-		 * size, so dev_end > dev_start always holds.
-		 */
-		if (r10_bio->devs[disk].bio) {
-			mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
-			mbio->bi_end_io = raid10_end_discard_request;
-			mbio->bi_private = r10_bio;
-			r10_bio->devs[disk].bio = mbio;
-			r10_bio->devs[disk].devnum = disk;
-			atomic_inc(&r10_bio->remaining);
-			md_submit_discard_bio(mddev, rdev, mbio,
-					dev_start + choose_data_offset(r10_bio, rdev),
-					dev_end - dev_start);
-			bio_endio(mbio);
-		}
-		if (r10_bio->devs[disk].repl_bio) {
-			rbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
-			rbio->bi_end_io = raid10_end_discard_request;
-			rbio->bi_private = r10_bio;
-			r10_bio->devs[disk].repl_bio = rbio;
-			r10_bio->devs[disk].devnum = disk;
-			atomic_inc(&r10_bio->remaining);
-			md_submit_discard_bio(mddev, rrdev, rbio,
-					dev_start + choose_data_offset(r10_bio, rrdev),
-					dev_end - dev_start);
-			bio_endio(rbio);
-		}
-	}
-
-	if (atomic_dec_and_test(&r10_bio->remaining)) {
-		md_write_end(r10_bio->mddev);
-		raid_end_bio_io(r10_bio);
-	}
-
-	return 0;
-out:
-	allow_barrier(conf);
-	return -EAGAIN;
-}
-
 static bool raid10_make_request(struct mddev *mddev, struct bio *bio)
 {
 	struct r10conf *conf = mddev->private;
@@ -1780,10 +1530,6 @@ static bool raid10_make_request(struct mddev *mddev, struct bio *bio)
 	if (!md_write_start(mddev, bio))
 		return false;
 
-	if (unlikely(bio_op(bio) == REQ_OP_DISCARD))
-		if (!raid10_handle_discard(mddev, bio))
-			return true;
-
 	/*
 	 * If this request crosses a chunk boundary, we need to split
 	 * it.
@@ -4023,7 +3769,7 @@ static int raid10_run(struct mddev *mddev)
 
 	if (mddev->queue) {
 		blk_queue_max_discard_sectors(mddev->queue,
-						UINT_MAX);
+						mddev->chunk_sectors);
 		blk_queue_max_write_same_sectors(mddev->queue, 0);
 		blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
 		blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
-- 
2.11.0
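
Note for reviewers: the reverted raid10_handle_discard() maps an array-level
discard range onto a per-disk trim range using the chunk/stripe geometry
computed above (first_stripe_index, start_disk_index, start_disk_offset, and
their end counterparts). The userspace sketch below mirrors only the
near-copies part of that math; it omits far/offset layouts, barriers, RCU,
and replacement handling, and the names geom_example and map_sector are
local to this example rather than kernel APIs.

/* geometry_example.c - hypothetical userspace sketch, not kernel code. */
#include <stdint.h>
#include <stdio.h>

struct geom_example {
	unsigned int raid_disks;	/* number of member disks */
	unsigned int near_copies;	/* data copies on adjacent disks */
	unsigned int chunk_shift;	/* log2(chunk size in 512B sectors) */
};

/*
 * Map an array-relative sector to (stripe index, disk index, disk offset),
 * following the near-copies math in the reverted raid10_handle_discard().
 */
static void map_sector(const struct geom_example *geo, uint64_t sector,
		       uint64_t *stripe, unsigned int *disk, uint64_t *offset)
{
	uint64_t chunk_mask = (1ULL << geo->chunk_shift) - 1;
	uint64_t chunk = (sector >> geo->chunk_shift) * geo->near_copies;

	*disk = (unsigned int)(chunk % geo->raid_disks); /* sector_div() in the kernel */
	*stripe = chunk / geo->raid_disks;
	*offset = (sector & chunk_mask) + (*stripe << geo->chunk_shift);
}

int main(void)
{
	/* Example geometry: 4 disks, 2 near copies, 64 KiB (128-sector) chunks. */
	struct geom_example geo = { 4, 2, 7 };
	uint64_t chunk_sectors = 1ULL << geo.chunk_shift;
	uint64_t bio_start = 256, bio_end = 1024;	/* discard range in sectors */

	uint64_t first_stripe, last_stripe, start_off, end_off;
	unsigned int start_disk, end_disk, disk;

	map_sector(&geo, bio_start, &first_stripe, &start_disk, &start_off);
	map_sector(&geo, bio_end, &last_stripe, &end_disk, &end_off);

	for (disk = 0; disk < geo.raid_disks; disk++) {
		uint64_t dev_start, dev_end;

		/* Disks before start_disk only join in the next stripe. */
		if (disk < start_disk)
			dev_start = (first_stripe + 1) * chunk_sectors;
		else if (disk > start_disk)
			dev_start = first_stripe * chunk_sectors;
		else
			dev_start = start_off;

		if (disk < end_disk)
			dev_end = (last_stripe + 1) * chunk_sectors;
		else if (disk > end_disk)
			dev_end = last_stripe * chunk_sectors;
		else
			dev_end = end_off;

		printf("disk %u: discard sectors [%llu, %llu)\n", disk,
		       (unsigned long long)dev_start,
		       (unsigned long long)dev_end);
	}
	return 0;
}

With this example geometry, a discard of logical sectors [256, 1024) maps to
sectors [128, 512) on every disk, which is consistent: 768 logical sectors
times two copies is 1536 device sectors, or 384 per disk across 4 disks.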