
raid5: handle expansion/resync case with stripe batching
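
The hunks below introduce stripe batching: each stripe_head gains batch_head, batch_list and batch_lock members, adjacent full-stripe writes are chained under a single head stripe, and the per-CPU scribble buffers move to a flex_array with one element per stripe of a chunk. As a rough mental model only, here is a minimal userspace sketch of the batch linkage; the struct and helper are illustrative stand-ins, not kernel code (the real implementation uses a circular list_head plus per-stripe spinlocks):

/* illustrative model: 'stripe' stands in for struct stripe_head */
#include <stdio.h>

struct stripe {
        unsigned long sector;
        struct stripe *batch_head;      /* head of the batch, or NULL */
        struct stripe *next;            /* simplified batch_list linkage */
};

/* chain 'sh' into the batch headed by 'head' (no locking in this sketch) */
static void add_to_batch(struct stripe *head, struct stripe *sh)
{
        if (!head->batch_head)
                head->batch_head = head;
        sh->batch_head = head->batch_head;
        sh->next = head->next;
        head->next = sh;
}

int main(void)
{
        struct stripe head = { .sector = 0 }, s1 = { .sector = 8 }, s2 = { .sector = 16 };

        add_to_batch(&head, &s1);
        add_to_batch(&head, &s2);
        for (struct stripe *p = head.next; p; p = p->next)
                printf("sector %lu handled via batch head %lu\n",
                       p->sector, p->batch_head->sector);
        return 0;
}

Handling then happens against the head stripe, which walks its batch members, as ops_run_io() and handle_stripe_clean_event() do in the hunks below.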
[sagit-ice-cold/kernel_xiaomi_msm8998.git]
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index cd2f96b..3ae097d 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -54,6 +54,7 @@
 #include <linux/slab.h>
 #include <linux/ratelimit.h>
 #include <linux/nodemask.h>
+#include <linux/flex_array.h>
 #include <trace/events/block.h>
 
 #include "md.h"
@@ -525,6 +526,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
        BUG_ON(atomic_read(&sh->count) != 0);
        BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
        BUG_ON(stripe_operations_active(sh));
+       BUG_ON(sh->batch_head);
 
        pr_debug("init_stripe called, stripe %llu\n",
                (unsigned long long)sector);
@@ -552,8 +554,10 @@ retry:
        }
        if (read_seqcount_retry(&conf->gen_lock, seq))
                goto retry;
+       sh->overwrite_disks = 0;
        insert_hash(conf, sh);
        sh->cpu = smp_processor_id();
+       set_bit(STRIPE_BATCH_READY, &sh->state);
 }
 
 static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
@@ -708,6 +712,130 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
        return sh;
 }
 
+static bool is_full_stripe_write(struct stripe_head *sh)
+{
+       BUG_ON(sh->overwrite_disks > (sh->disks - sh->raid_conf->max_degraded));
+       return sh->overwrite_disks == (sh->disks - sh->raid_conf->max_degraded);
+}
+
+static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
+{
+       local_irq_disable();
+       if (sh1 > sh2) {
+               spin_lock(&sh2->stripe_lock);
+               spin_lock_nested(&sh1->stripe_lock, 1);
+       } else {
+               spin_lock(&sh1->stripe_lock);
+               spin_lock_nested(&sh2->stripe_lock, 1);
+       }
+}
+
+static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
+{
+       spin_unlock(&sh1->stripe_lock);
+       spin_unlock(&sh2->stripe_lock);
+       local_irq_enable();
+}
+
+/* Only a freshly initialized, full-stripe normal write can be added to a batch list */
+static bool stripe_can_batch(struct stripe_head *sh)
+{
+       return test_bit(STRIPE_BATCH_READY, &sh->state) &&
+               is_full_stripe_write(sh);
+}
+
+/* we only search backwards (towards lower sectors) for a batch head */
+static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh)
+{
+       struct stripe_head *head;
+       sector_t head_sector, tmp_sec;
+       int hash;
+       int dd_idx;
+
+       if (!stripe_can_batch(sh))
+               return;
+       /* Don't cross a chunk boundary, so pd_idx/qd_idx stay the same */
+       tmp_sec = sh->sector;
+       if (!sector_div(tmp_sec, conf->chunk_sectors))
+               return;
+       head_sector = sh->sector - STRIPE_SECTORS;
+
+       hash = stripe_hash_locks_hash(head_sector);
+       spin_lock_irq(conf->hash_locks + hash);
+       head = __find_stripe(conf, head_sector, conf->generation);
+       if (head && !atomic_inc_not_zero(&head->count)) {
+               spin_lock(&conf->device_lock);
+               if (!atomic_read(&head->count)) {
+                       if (!test_bit(STRIPE_HANDLE, &head->state))
+                               atomic_inc(&conf->active_stripes);
+                       BUG_ON(list_empty(&head->lru) &&
+                              !test_bit(STRIPE_EXPANDING, &head->state));
+                       list_del_init(&head->lru);
+                       if (head->group) {
+                               head->group->stripes_cnt--;
+                               head->group = NULL;
+                       }
+               }
+               atomic_inc(&head->count);
+               spin_unlock(&conf->device_lock);
+       }
+       spin_unlock_irq(conf->hash_locks + hash);
+
+       if (!head)
+               return;
+       if (!stripe_can_batch(head))
+               goto out;
+
+       lock_two_stripes(head, sh);
+       /* clear_batch_ready() clears the flag */
+       if (!stripe_can_batch(head) || !stripe_can_batch(sh))
+               goto unlock_out;
+
+       if (sh->batch_head)
+               goto unlock_out;
+
+       dd_idx = 0;
+       while (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx)
+               dd_idx++;
+       if (head->dev[dd_idx].towrite->bi_rw != sh->dev[dd_idx].towrite->bi_rw)
+               goto unlock_out;
+
+       if (head->batch_head) {
+               spin_lock(&head->batch_head->batch_lock);
+               /* This batch list is already running */
+               if (!stripe_can_batch(head)) {
+                       spin_unlock(&head->batch_head->batch_lock);
+                       goto unlock_out;
+               }
+
+               /*
+                * at this point, head's BATCH_READY could be cleared, but we
+                * can still add the stripe to the batch list
+                */
+               list_add(&sh->batch_list, &head->batch_list);
+               spin_unlock(&head->batch_head->batch_lock);
+
+               sh->batch_head = head->batch_head;
+       } else {
+               head->batch_head = head;
+               sh->batch_head = head->batch_head;
+               spin_lock(&head->batch_lock);
+               list_add_tail(&sh->batch_list, &head->batch_list);
+               spin_unlock(&head->batch_lock);
+       }
+
+       if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+               if (atomic_dec_return(&conf->preread_active_stripes)
+                   < IO_THRESHOLD)
+                       md_wakeup_thread(conf->mddev->thread);
+
+       atomic_inc(&sh->count);
+unlock_out:
+       unlock_two_stripes(head, sh);
+out:
+       release_stripe(head);
+}
+
 /* Determine if 'data_offset' or 'new_data_offset' should be used
  * in this stripe_head.
  */
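
A note on lock_two_stripes() in the hunk above: the two stripe locks are taken in address order, which is the standard way to avoid an ABBA deadlock when two contexts try to lock the same pair of stripes. A minimal userspace sketch of the same idea, with pthread mutexes standing in for the stripe spinlocks (illustrative only):

#include <pthread.h>

struct obj {
        pthread_mutex_t lock;
};

/* always lock the lower-addressed object first, mirroring lock_two_stripes() */
static void lock_pair(struct obj *a, struct obj *b)
{
        if (a > b) {
                pthread_mutex_lock(&b->lock);
                pthread_mutex_lock(&a->lock);
        } else {
                pthread_mutex_lock(&a->lock);
                pthread_mutex_lock(&b->lock);
        }
}

static void unlock_pair(struct obj *a, struct obj *b)
{
        pthread_mutex_unlock(&a->lock);
        pthread_mutex_unlock(&b->lock);
}

int main(void)
{
        struct obj x = { PTHREAD_MUTEX_INITIALIZER };
        struct obj y = { PTHREAD_MUTEX_INITIALIZER };

        lock_pair(&x, &y);
        unlock_pair(&x, &y);
        return 0;
}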
@@ -738,6 +866,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 {
        struct r5conf *conf = sh->raid_conf;
        int i, disks = sh->disks;
+       struct stripe_head *head_sh = sh;
 
        might_sleep();
 
@@ -746,6 +875,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
                int replace_only = 0;
                struct bio *bi, *rbi;
                struct md_rdev *rdev, *rrdev = NULL;
+
+               sh = head_sh;
                if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
                        if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
                                rw = WRITE_FUA;
@@ -764,6 +895,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
                if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags))
                        rw |= REQ_SYNC;
 
+again:
                bi = &sh->dev[i].req;
                rbi = &sh->dev[i].rreq; /* For writing to replacement */
 
@@ -782,7 +914,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
                                /* We raced and saw duplicates */
                                rrdev = NULL;
                } else {
-                       if (test_bit(R5_ReadRepl, &sh->dev[i].flags) && rrdev)
+                       if (test_bit(R5_ReadRepl, &head_sh->dev[i].flags) && rrdev)
                                rdev = rrdev;
                        rrdev = NULL;
                }
@@ -853,13 +985,15 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
                                __func__, (unsigned long long)sh->sector,
                                bi->bi_rw, i);
                        atomic_inc(&sh->count);
+                       if (sh != head_sh)
+                               atomic_inc(&head_sh->count);
                        if (use_new_offset(conf, sh))
                                bi->bi_iter.bi_sector = (sh->sector
                                                 + rdev->new_data_offset);
                        else
                                bi->bi_iter.bi_sector = (sh->sector
                                                 + rdev->data_offset);
-                       if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
+                       if (test_bit(R5_ReadNoMerge, &head_sh->dev[i].flags))
                                bi->bi_rw |= REQ_NOMERGE;
 
                        if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
@@ -903,6 +1037,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
                                __func__, (unsigned long long)sh->sector,
                                rbi->bi_rw, i);
                        atomic_inc(&sh->count);
+                       if (sh != head_sh)
+                               atomic_inc(&head_sh->count);
                        if (use_new_offset(conf, sh))
                                rbi->bi_iter.bi_sector = (sh->sector
                                                  + rrdev->new_data_offset);
@@ -934,8 +1070,18 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
                        pr_debug("skip op %ld on disc %d for sector %llu\n",
                                bi->bi_rw, i, (unsigned long long)sh->sector);
                        clear_bit(R5_LOCKED, &sh->dev[i].flags);
+                       if (sh->batch_head)
+                               set_bit(STRIPE_BATCH_ERR,
+                                       &sh->batch_head->state);
                        set_bit(STRIPE_HANDLE, &sh->state);
                }
+
+               if (!head_sh->batch_head)
+                       continue;
+               sh = list_first_entry(&sh->batch_list, struct stripe_head,
+                                     batch_list);
+               if (sh != head_sh)
+                       goto again;
        }
 }
 
@@ -1051,6 +1197,7 @@ static void ops_run_biofill(struct stripe_head *sh)
        struct async_submit_ctl submit;
        int i;
 
+       BUG_ON(sh->batch_head);
        pr_debug("%s: stripe %llu\n", __func__,
                (unsigned long long)sh->sector);
 
@@ -1109,16 +1256,28 @@ static void ops_complete_compute(void *stripe_head_ref)
 
 /* return a pointer to the address conversion region of the scribble buffer */
 static addr_conv_t *to_addr_conv(struct stripe_head *sh,
-                                struct raid5_percpu *percpu)
+                                struct raid5_percpu *percpu, int i)
+{
+       void *addr;
+
+       addr = flex_array_get(percpu->scribble, i);
+       return addr + sizeof(struct page *) * (sh->disks + 2);
+}
+
+/* return a pointer to the address conversion region of the scribble buffer */
+static struct page **to_addr_page(struct raid5_percpu *percpu, int i)
 {
-       return percpu->scribble + sizeof(struct page *) * (sh->disks + 2);
+       void *addr;
+
+       addr = flex_array_get(percpu->scribble, i);
+       return addr;
 }
 
 static struct dma_async_tx_descriptor *
 ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
 {
        int disks = sh->disks;
-       struct page **xor_srcs = percpu->scribble;
+       struct page **xor_srcs = to_addr_page(percpu, 0);
        int target = sh->ops.target;
        struct r5dev *tgt = &sh->dev[target];
        struct page *xor_dest = tgt->page;
@@ -1127,6 +1286,8 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
        struct async_submit_ctl submit;
        int i;
 
+       BUG_ON(sh->batch_head);
+
        pr_debug("%s: stripe %llu block: %d\n",
                __func__, (unsigned long long)sh->sector, target);
        BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
@@ -1138,7 +1299,7 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
        atomic_inc(&sh->count);
 
        init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
-                         ops_complete_compute, sh, to_addr_conv(sh, percpu));
+                         ops_complete_compute, sh, to_addr_conv(sh, percpu, 0));
        if (unlikely(count == 1))
                tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
        else
@@ -1183,7 +1344,7 @@ static struct dma_async_tx_descriptor *
 ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
 {
        int disks = sh->disks;
-       struct page **blocks = percpu->scribble;
+       struct page **blocks = to_addr_page(percpu, 0);
        int target;
        int qd_idx = sh->qd_idx;
        struct dma_async_tx_descriptor *tx;
@@ -1193,6 +1354,7 @@ ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
        int i;
        int count;
 
+       BUG_ON(sh->batch_head);
        if (sh->ops.target < 0)
                target = sh->ops.target2;
        else if (sh->ops.target2 < 0)
@@ -1216,7 +1378,7 @@ ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
                BUG_ON(blocks[count+1] != dest); /* q should already be set */
                init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
                                  ops_complete_compute, sh,
-                                 to_addr_conv(sh, percpu));
+                                 to_addr_conv(sh, percpu, 0));
                tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
        } else {
                /* Compute any data- or p-drive using XOR */
@@ -1229,7 +1391,7 @@ ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
 
                init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
                                  NULL, ops_complete_compute, sh,
-                                 to_addr_conv(sh, percpu));
+                                 to_addr_conv(sh, percpu, 0));
                tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit);
        }
 
@@ -1248,9 +1410,10 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
        struct r5dev *tgt = &sh->dev[target];
        struct r5dev *tgt2 = &sh->dev[target2];
        struct dma_async_tx_descriptor *tx;
-       struct page **blocks = percpu->scribble;
+       struct page **blocks = to_addr_page(percpu, 0);
        struct async_submit_ctl submit;
 
+       BUG_ON(sh->batch_head);
        pr_debug("%s: stripe %llu block1: %d block2: %d\n",
                 __func__, (unsigned long long)sh->sector, target, target2);
        BUG_ON(target < 0 || target2 < 0);
@@ -1290,7 +1453,7 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
                        /* Missing P+Q, just recompute */
                        init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
                                          ops_complete_compute, sh,
-                                         to_addr_conv(sh, percpu));
+                                         to_addr_conv(sh, percpu, 0));
                        return async_gen_syndrome(blocks, 0, syndrome_disks+2,
                                                  STRIPE_SIZE, &submit);
                } else {
@@ -1314,21 +1477,21 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
                        init_async_submit(&submit,
                                          ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
                                          NULL, NULL, NULL,
-                                         to_addr_conv(sh, percpu));
+                                         to_addr_conv(sh, percpu, 0));
                        tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
                                       &submit);
 
                        count = set_syndrome_sources(blocks, sh);
                        init_async_submit(&submit, ASYNC_TX_FENCE, tx,
                                          ops_complete_compute, sh,
-                                         to_addr_conv(sh, percpu));
+                                         to_addr_conv(sh, percpu, 0));
                        return async_gen_syndrome(blocks, 0, count+2,
                                                  STRIPE_SIZE, &submit);
                }
        } else {
                init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
                                  ops_complete_compute, sh,
-                                 to_addr_conv(sh, percpu));
+                                 to_addr_conv(sh, percpu, 0));
                if (failb == syndrome_disks) {
                        /* We're missing D+P. */
                        return async_raid6_datap_recov(syndrome_disks+2,
@@ -1356,13 +1519,14 @@ ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
               struct dma_async_tx_descriptor *tx)
 {
        int disks = sh->disks;
-       struct page **xor_srcs = percpu->scribble;
+       struct page **xor_srcs = to_addr_page(percpu, 0);
        int count = 0, pd_idx = sh->pd_idx, i;
        struct async_submit_ctl submit;
 
        /* existing parity data subtracted */
        struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
 
+       BUG_ON(sh->batch_head);
        pr_debug("%s: stripe %llu\n", __func__,
                (unsigned long long)sh->sector);
 
@@ -1374,7 +1538,7 @@ ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
        }
 
        init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
-                         ops_complete_prexor, sh, to_addr_conv(sh, percpu));
+                         ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
        tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
 
        return tx;
@@ -1385,20 +1549,25 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 {
        int disks = sh->disks;
        int i;
+       struct stripe_head *head_sh = sh;
 
        pr_debug("%s: stripe %llu\n", __func__,
                (unsigned long long)sh->sector);
 
        for (i = disks; i--; ) {
-               struct r5dev *dev = &sh->dev[i];
+               struct r5dev *dev;
                struct bio *chosen;
 
-               if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) {
+               sh = head_sh;
+               if (test_and_clear_bit(R5_Wantdrain, &head_sh->dev[i].flags)) {
                        struct bio *wbi;
 
+again:
+                       dev = &sh->dev[i];
                        spin_lock_irq(&sh->stripe_lock);
                        chosen = dev->towrite;
                        dev->towrite = NULL;
+                       sh->overwrite_disks = 0;
                        BUG_ON(dev->written);
                        wbi = dev->written = chosen;
                        spin_unlock_irq(&sh->stripe_lock);
@@ -1423,6 +1592,15 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
                                }
                                wbi = r5_next_bio(wbi, dev->sector);
                        }
+
+                       if (head_sh->batch_head) {
+                               sh = list_first_entry(&sh->batch_list,
+                                                     struct stripe_head,
+                                                     batch_list);
+                               if (sh == head_sh)
+                                       continue;
+                               goto again;
+                       }
                }
        }
 
@@ -1478,12 +1656,15 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
                     struct dma_async_tx_descriptor *tx)
 {
        int disks = sh->disks;
-       struct page **xor_srcs = percpu->scribble;
+       struct page **xor_srcs;
        struct async_submit_ctl submit;
-       int count = 0, pd_idx = sh->pd_idx, i;
+       int count, pd_idx = sh->pd_idx, i;
        struct page *xor_dest;
        int prexor = 0;
        unsigned long flags;
+       int j = 0;
+       struct stripe_head *head_sh = sh;
+       int last_stripe;
 
        pr_debug("%s: stripe %llu\n", __func__,
                (unsigned long long)sh->sector);
@@ -1500,15 +1681,18 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
                ops_complete_reconstruct(sh);
                return;
        }
+again:
+       count = 0;
+       xor_srcs = to_addr_page(percpu, j);
        /* check if prexor is active which means only process blocks
         * that are part of a read-modify-write (written)
         */
-       if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
+       if (head_sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
                prexor = 1;
                xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
                for (i = disks; i--; ) {
                        struct r5dev *dev = &sh->dev[i];
-                       if (dev->written)
+                       if (head_sh->dev[i].written)
                                xor_srcs[count++] = dev->page;
                }
        } else {
@@ -1525,17 +1709,32 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
         * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
         * for the synchronous xor case
         */
-       flags = ASYNC_TX_ACK |
-               (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
-
-       atomic_inc(&sh->count);
+       last_stripe = !head_sh->batch_head ||
+               list_first_entry(&sh->batch_list,
+                                struct stripe_head, batch_list) == head_sh;
+       if (last_stripe) {
+               flags = ASYNC_TX_ACK |
+                       (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
+
+               atomic_inc(&head_sh->count);
+               init_async_submit(&submit, flags, tx, ops_complete_reconstruct, head_sh,
+                                 to_addr_conv(sh, percpu, j));
+       } else {
+               flags = prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST;
+               init_async_submit(&submit, flags, tx, NULL, NULL,
+                                 to_addr_conv(sh, percpu, j));
+       }
 
-       init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh,
-                         to_addr_conv(sh, percpu));
        if (unlikely(count == 1))
                tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
        else
                tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
+       if (!last_stripe) {
+               j++;
+               sh = list_first_entry(&sh->batch_list, struct stripe_head,
+                                     batch_list);
+               goto again;
+       }
 }
 
 static void
@@ -1543,8 +1742,10 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
                     struct dma_async_tx_descriptor *tx)
 {
        struct async_submit_ctl submit;
-       struct page **blocks = percpu->scribble;
-       int count, i;
+       struct page **blocks;
+       int count, i, j = 0;
+       struct stripe_head *head_sh = sh;
+       int last_stripe;
 
        pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
 
@@ -1562,13 +1763,27 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
                return;
        }
 
+again:
+       blocks = to_addr_page(percpu, j);
        count = set_syndrome_sources(blocks, sh);
-
-       atomic_inc(&sh->count);
-
-       init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct,
-                         sh, to_addr_conv(sh, percpu));
+       last_stripe = !head_sh->batch_head ||
+               list_first_entry(&sh->batch_list,
+                                struct stripe_head, batch_list) == head_sh;
+
+       if (last_stripe) {
+               atomic_inc(&head_sh->count);
+               init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct,
+                                 head_sh, to_addr_conv(sh, percpu, j));
+       } else
+               init_async_submit(&submit, 0, tx, NULL, NULL,
+                                 to_addr_conv(sh, percpu, j));
        async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE,  &submit);
+       if (!last_stripe) {
+               j++;
+               sh = list_first_entry(&sh->batch_list, struct stripe_head,
+                                     batch_list);
+               goto again;
+       }
 }
 
 static void ops_complete_check(void *stripe_head_ref)
@@ -1589,7 +1804,7 @@ static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
        int pd_idx = sh->pd_idx;
        int qd_idx = sh->qd_idx;
        struct page *xor_dest;
-       struct page **xor_srcs = percpu->scribble;
+       struct page **xor_srcs = to_addr_page(percpu, 0);
        struct dma_async_tx_descriptor *tx;
        struct async_submit_ctl submit;
        int count;
@@ -1598,6 +1813,7 @@ static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
        pr_debug("%s: stripe %llu\n", __func__,
                (unsigned long long)sh->sector);
 
+       BUG_ON(sh->batch_head);
        count = 0;
        xor_dest = sh->dev[pd_idx].page;
        xor_srcs[count++] = xor_dest;
@@ -1608,7 +1824,7 @@ static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
        }
 
        init_async_submit(&submit, 0, NULL, NULL, NULL,
-                         to_addr_conv(sh, percpu));
+                         to_addr_conv(sh, percpu, 0));
        tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
                           &sh->ops.zero_sum_result, &submit);
 
@@ -1619,20 +1835,21 @@ static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
 
 static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp)
 {
-       struct page **srcs = percpu->scribble;
+       struct page **srcs = to_addr_page(percpu, 0);
        struct async_submit_ctl submit;
        int count;
 
        pr_debug("%s: stripe %llu checkp: %d\n", __func__,
                (unsigned long long)sh->sector, checkp);
 
+       BUG_ON(sh->batch_head);
        count = set_syndrome_sources(srcs, sh);
        if (!checkp)
                srcs[count] = NULL;
 
        atomic_inc(&sh->count);
        init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
-                         sh, to_addr_conv(sh, percpu));
+                         sh, to_addr_conv(sh, percpu, 0));
        async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE,
                           &sh->ops.zero_sum_result, percpu->spare_page, &submit);
 }
@@ -1693,7 +1910,7 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
                        BUG();
        }
 
-       if (overlap_clear)
+       if (overlap_clear && !sh->batch_head)
                for (i = disks; i--; ) {
                        struct r5dev *dev = &sh->dev[i];
                        if (test_and_clear_bit(R5_Overlap, &dev->flags))
@@ -1723,6 +1940,10 @@ static int grow_one_stripe(struct r5conf *conf, int hash)
        atomic_set(&sh->count, 1);
        atomic_inc(&conf->active_stripes);
        INIT_LIST_HEAD(&sh->lru);
+
+       spin_lock_init(&sh->batch_lock);
+       INIT_LIST_HEAD(&sh->batch_list);
+       sh->batch_head = NULL;
        release_stripe(sh);
        return 1;
 }
@@ -1772,13 +1993,21 @@ static int grow_stripes(struct r5conf *conf, int num)
  * calculate over all devices (not just the data blocks), using zeros in place
  * of the P and Q blocks.
  */
-static size_t scribble_len(int num)
+static struct flex_array *scribble_alloc(int num, int cnt, gfp_t flags)
 {
+       struct flex_array *ret;
        size_t len;
 
        len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2);
-
-       return len;
+       ret = flex_array_alloc(len, cnt, flags);
+       if (!ret)
+               return NULL;
+       /* always prealloc all elements, so no locking is required */
+       if (flex_array_prealloc(ret, 0, cnt, flags)) {
+               flex_array_free(ret);
+               return NULL;
+       }
+       return ret;
 }
 
 static int resize_stripes(struct r5conf *conf, int newsize)
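
For the scribble_alloc()/to_addr_page()/to_addr_conv() hunk above: each flex_array element holds the page-pointer list followed by the address-conversion region, and there is one element per stripe of a chunk so each member of a batch gets its own scratch space. A hedged userspace sketch of that per-element layout (the typedefs are stand-ins for struct page * and addr_conv_t, not the kernel's types):

#include <stdio.h>
#include <stdlib.h>

typedef void *page_ptr;         /* stands in for struct page *  */
typedef void *addr_conv;        /* stands in for addr_conv_t    */

/* element layout: (num + 2) page pointers followed by (num + 2) addr_conv slots */
static size_t scribble_element_len(int num)
{
        return sizeof(page_ptr) * (num + 2) + sizeof(addr_conv) * (num + 2);
}

/* start of the address-conversion region, mirroring to_addr_conv() */
static void *addr_conv_region(void *element, int disks)
{
        return (char *)element + sizeof(page_ptr) * (disks + 2);
}

int main(void)
{
        int disks = 8;
        void *elem = calloc(1, scribble_element_len(disks));

        printf("element bytes: %zu, addr_conv offset: %zu\n",
               scribble_element_len(disks),
               (size_t)((char *)addr_conv_region(elem, disks) - (char *)elem));
        free(elem);
        return 0;
}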
@@ -1896,16 +2125,16 @@ static int resize_stripes(struct r5conf *conf, int newsize)
                err = -ENOMEM;
 
        get_online_cpus();
-       conf->scribble_len = scribble_len(newsize);
        for_each_present_cpu(cpu) {
                struct raid5_percpu *percpu;
-               void *scribble;
+               struct flex_array *scribble;
 
                percpu = per_cpu_ptr(conf->percpu, cpu);
-               scribble = kmalloc(conf->scribble_len, GFP_NOIO);
+               scribble = scribble_alloc(newsize, conf->chunk_sectors /
+                       STRIPE_SECTORS, GFP_NOIO);
 
                if (scribble) {
-                       kfree(percpu->scribble);
+                       flex_array_free(percpu->scribble);
                        percpu->scribble = scribble;
                } else {
                        err = -ENOMEM;
@@ -2154,10 +2383,16 @@ static void raid5_end_write_request(struct bio *bi, int error)
        }
        rdev_dec_pending(rdev, conf->mddev);
 
+       if (sh->batch_head && !uptodate)
+               set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state);
+
        if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
                clear_bit(R5_LOCKED, &sh->dev[i].flags);
        set_bit(STRIPE_HANDLE, &sh->state);
        release_stripe(sh);
+
+       if (sh->batch_head && sh != sh->batch_head)
+               release_stripe(sh->batch_head);
 }
 
 static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous);
@@ -2624,7 +2859,8 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
  * toread/towrite point to the first in a chain.
  * The bi_next chain must be in order.
  */
-static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
+static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
+                         int forwrite, int previous)
 {
        struct bio **bip;
        struct r5conf *conf = sh->raid_conf;
@@ -2643,6 +2879,9 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
         * protect it.
         */
        spin_lock_irq(&sh->stripe_lock);
+       /* Don't allow new IO to be added to stripes in a batch list */
+       if (sh->batch_head)
+               goto overlap;
        if (forwrite) {
                bip = &sh->dev[dd_idx].towrite;
                if (*bip == NULL)
@@ -2657,6 +2896,9 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
        if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi))
                goto overlap;
 
+       if (!forwrite || previous)
+               clear_bit(STRIPE_BATCH_READY, &sh->state);
+
        BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next);
        if (*bip)
                bi->bi_next = *bip;
@@ -2674,7 +2916,8 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
                                sector = bio_end_sector(bi);
                }
                if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
-                       set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
+                       if (!test_and_set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags))
+                               sh->overwrite_disks++;
        }
 
        pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
@@ -2688,6 +2931,9 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
                sh->bm_seq = conf->seq_flush+1;
                set_bit(STRIPE_BIT_DELAY, &sh->state);
        }
+
+       if (stripe_can_batch(sh))
+               stripe_add_to_batch_list(conf, sh);
        return 1;
 
  overlap:
@@ -2720,6 +2966,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
                                struct bio **return_bi)
 {
        int i;
+       BUG_ON(sh->batch_head);
        for (i = disks; i--; ) {
                struct bio *bi;
                int bitmap_end = 0;
@@ -2746,6 +2993,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
                /* fail all writes first */
                bi = sh->dev[i].towrite;
                sh->dev[i].towrite = NULL;
+               sh->overwrite_disks = 0;
                spin_unlock_irq(&sh->stripe_lock);
                if (bi)
                        bitmap_end = 1;
@@ -2834,6 +3082,7 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
        int abort = 0;
        int i;
 
+       BUG_ON(sh->batch_head);
        clear_bit(STRIPE_SYNCING, &sh->state);
        if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
                wake_up(&conf->wait_for_overlap);
@@ -3064,6 +3313,7 @@ static void handle_stripe_fill(struct stripe_head *sh,
 {
        int i;
 
+       BUG_ON(sh->batch_head);
        /* look for blocks to read/compute, skip this if a compute
         * is already in flight, or if the stripe contents are in the
         * midst of changing due to a write
@@ -3087,6 +3337,9 @@ static void handle_stripe_clean_event(struct r5conf *conf,
        int i;
        struct r5dev *dev;
        int discard_pending = 0;
+       struct stripe_head *head_sh = sh;
+       bool do_endio = false;
+       int wakeup_nr = 0;
 
        for (i = disks; i--; )
                if (sh->dev[i].written) {
@@ -3102,8 +3355,11 @@ static void handle_stripe_clean_event(struct r5conf *conf,
                                        clear_bit(R5_UPTODATE, &dev->flags);
                                if (test_and_clear_bit(R5_SkipCopy, &dev->flags)) {
                                        WARN_ON(test_bit(R5_UPTODATE, &dev->flags));
-                                       dev->page = dev->orig_page;
                                }
+                               do_endio = true;
+
+returnbi:
+                               dev->page = dev->orig_page;
                                wbi = dev->written;
                                dev->written = NULL;
                                while (wbi && wbi->bi_iter.bi_sector <
@@ -3120,6 +3376,17 @@ static void handle_stripe_clean_event(struct r5conf *conf,
                                                STRIPE_SECTORS,
                                         !test_bit(STRIPE_DEGRADED, &sh->state),
                                                0);
+                               if (head_sh->batch_head) {
+                                       sh = list_first_entry(&sh->batch_list,
+                                                             struct stripe_head,
+                                                             batch_list);
+                                       if (sh != head_sh) {
+                                               dev = &sh->dev[i];
+                                               goto returnbi;
+                                       }
+                               }
+                               sh = head_sh;
+                               dev = &sh->dev[i];
                        } else if (test_bit(R5_Discard, &dev->flags))
                                discard_pending = 1;
                        WARN_ON(test_bit(R5_SkipCopy, &dev->flags));
@@ -3141,8 +3408,17 @@ static void handle_stripe_clean_event(struct r5conf *conf,
                 * will be reinitialized
                 */
                spin_lock_irq(&conf->device_lock);
+unhash:
                remove_hash(sh);
+               if (head_sh->batch_head) {
+                       sh = list_first_entry(&sh->batch_list,
+                                             struct stripe_head, batch_list);
+                       if (sh != head_sh)
+                                       goto unhash;
+               }
                spin_unlock_irq(&conf->device_lock);
+               sh = head_sh;
+
                if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))
                        set_bit(STRIPE_HANDLE, &sh->state);
 
@@ -3151,6 +3427,45 @@ static void handle_stripe_clean_event(struct r5conf *conf,
        if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
                if (atomic_dec_and_test(&conf->pending_full_writes))
                        md_wakeup_thread(conf->mddev->thread);
+
+       if (!head_sh->batch_head || !do_endio)
+               return;
+       for (i = 0; i < head_sh->disks; i++) {
+               if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags))
+                       wakeup_nr++;
+       }
+       while (!list_empty(&head_sh->batch_list)) {
+               int i;
+               sh = list_first_entry(&head_sh->batch_list,
+                                     struct stripe_head, batch_list);
+               list_del_init(&sh->batch_list);
+
+               set_mask_bits(&sh->state, ~STRIPE_EXPAND_SYNC_FLAG,
+                             head_sh->state & ~((1 << STRIPE_ACTIVE) |
+                                                (1 << STRIPE_PREREAD_ACTIVE) |
+                                                STRIPE_EXPAND_SYNC_FLAG));
+               sh->check_state = head_sh->check_state;
+               sh->reconstruct_state = head_sh->reconstruct_state;
+               for (i = 0; i < sh->disks; i++) {
+                       if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
+                               wakeup_nr++;
+                       sh->dev[i].flags = head_sh->dev[i].flags;
+               }
+
+               spin_lock_irq(&sh->stripe_lock);
+               sh->batch_head = NULL;
+               spin_unlock_irq(&sh->stripe_lock);
+               if (sh->state & STRIPE_EXPAND_SYNC_FLAG)
+                       set_bit(STRIPE_HANDLE, &sh->state);
+               release_stripe(sh);
+       }
+
+       spin_lock_irq(&head_sh->stripe_lock);
+       head_sh->batch_head = NULL;
+       spin_unlock_irq(&head_sh->stripe_lock);
+       wake_up_nr(&conf->wait_for_overlap, wakeup_nr);
+       if (head_sh->state & STRIPE_EXPAND_SYNC_FLAG)
+               set_bit(STRIPE_HANDLE, &head_sh->state);
 }
 
 static void handle_stripe_dirtying(struct r5conf *conf,
@@ -3290,6 +3605,7 @@ static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
 {
        struct r5dev *dev = NULL;
 
+       BUG_ON(sh->batch_head);
        set_bit(STRIPE_HANDLE, &sh->state);
 
        switch (sh->check_state) {
@@ -3380,6 +3696,7 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
        int qd_idx = sh->qd_idx;
        struct r5dev *dev;
 
+       BUG_ON(sh->batch_head);
        set_bit(STRIPE_HANDLE, &sh->state);
 
        BUG_ON(s->failed > 2);
@@ -3543,6 +3860,7 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
         * copy some of them into a target stripe for expand.
         */
        struct dma_async_tx_descriptor *tx = NULL;
+       BUG_ON(sh->batch_head);
        clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
        for (i = 0; i < sh->disks; i++)
                if (i != sh->pd_idx && i != sh->qd_idx) {
@@ -3615,8 +3933,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 
        memset(s, 0, sizeof(*s));
 
-       s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
-       s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
+       s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state) && !sh->batch_head;
+       s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state) && !sh->batch_head;
        s->failed_num[0] = -1;
        s->failed_num[1] = -1;
 
@@ -3786,6 +4104,80 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
        rcu_read_unlock();
 }
 
+static int clear_batch_ready(struct stripe_head *sh)
+{
+       struct stripe_head *tmp;
+       if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state))
+               return 0;
+       spin_lock(&sh->stripe_lock);
+       if (!sh->batch_head) {
+               spin_unlock(&sh->stripe_lock);
+               return 0;
+       }
+
+       /*
+        * this stripe could be added to a batch list before we check
+        * BATCH_READY, skip it
+        */
+       if (sh->batch_head != sh) {
+               spin_unlock(&sh->stripe_lock);
+               return 1;
+       }
+       spin_lock(&sh->batch_lock);
+       list_for_each_entry(tmp, &sh->batch_list, batch_list)
+               clear_bit(STRIPE_BATCH_READY, &tmp->state);
+       spin_unlock(&sh->batch_lock);
+       spin_unlock(&sh->stripe_lock);
+
+       /*
+        * BATCH_READY is cleared, no new stripes can be added.
+        * batch_list can be accessed without the lock
+        */
+       return 0;
+}
+
+static void check_break_stripe_batch_list(struct stripe_head *sh)
+{
+       struct stripe_head *head_sh, *next;
+       int i;
+
+       if (!test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state))
+               return;
+
+       head_sh = sh;
+       do {
+               sh = list_first_entry(&sh->batch_list,
+                                     struct stripe_head, batch_list);
+               BUG_ON(sh == head_sh);
+       } while (!test_bit(STRIPE_DEGRADED, &sh->state));
+
+       while (sh != head_sh) {
+               next = list_first_entry(&sh->batch_list,
+                                       struct stripe_head, batch_list);
+               list_del_init(&sh->batch_list);
+
+               set_mask_bits(&sh->state, ~STRIPE_EXPAND_SYNC_FLAG,
+                             head_sh->state & ~((1 << STRIPE_ACTIVE) |
+                                                (1 << STRIPE_PREREAD_ACTIVE) |
+                                                (1 << STRIPE_DEGRADED) |
+                                                STRIPE_EXPAND_SYNC_FLAG));
+               sh->check_state = head_sh->check_state;
+               sh->reconstruct_state = head_sh->reconstruct_state;
+               for (i = 0; i < sh->disks; i++)
+                       sh->dev[i].flags = head_sh->dev[i].flags &
+                               (~((1 << R5_WriteError) | (1 << R5_Overlap)));
+
+               spin_lock_irq(&sh->stripe_lock);
+               sh->batch_head = NULL;
+               spin_unlock_irq(&sh->stripe_lock);
+
+               set_bit(STRIPE_HANDLE, &sh->state);
+               release_stripe(sh);
+
+               sh = next;
+       }
+}
+
 static void handle_stripe(struct stripe_head *sh)
 {
        struct stripe_head_state s;
@@ -3803,7 +4195,14 @@ static void handle_stripe(struct stripe_head *sh)
                return;
        }
 
-       if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
+       if (clear_batch_ready(sh)) {
+               clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
+               return;
+       }
+
+       check_break_stripe_batch_list(sh);
+
+       if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) {
                spin_lock(&sh->stripe_lock);
                /* Cannot process 'sync' concurrently with 'discard' */
                if (!test_bit(STRIPE_DISCARD, &sh->state) &&
@@ -4603,12 +5002,14 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
                }
                set_bit(STRIPE_DISCARD, &sh->state);
                finish_wait(&conf->wait_for_overlap, &w);
+               sh->overwrite_disks = 0;
                for (d = 0; d < conf->raid_disks; d++) {
                        if (d == sh->pd_idx || d == sh->qd_idx)
                                continue;
                        sh->dev[d].towrite = bi;
                        set_bit(R5_OVERWRITE, &sh->dev[d].flags);
                        raid5_inc_bi_active_stripes(bi);
+                       sh->overwrite_disks++;
                }
                spin_unlock_irq(&sh->stripe_lock);
                if (conf->mddev->bitmap) {
@@ -4772,7 +5173,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
                        }
 
                        if (test_bit(STRIPE_EXPANDING, &sh->state) ||
-                           !add_stripe_bio(sh, bi, dd_idx, rw)) {
+                           !add_stripe_bio(sh, bi, dd_idx, rw, previous)) {
                                /* Stripe is busy expanding or
                                 * add failed due to overlap.  Flush everything
                                 * and wait a while
@@ -4785,7 +5186,8 @@ static void make_request(struct mddev *mddev, struct bio * bi)
                        }
                        set_bit(STRIPE_HANDLE, &sh->state);
                        clear_bit(STRIPE_DELAYED, &sh->state);
-                       if ((bi->bi_rw & REQ_SYNC) &&
+                       if ((!sh->batch_head || sh == sh->batch_head) &&
+                           (bi->bi_rw & REQ_SYNC) &&
                            !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
                                atomic_inc(&conf->preread_active_stripes);
                        release_stripe_plug(mddev, sh);
@@ -5050,8 +5452,7 @@ ret:
        return reshape_sectors;
 }
 
-/* FIXME go_faster isn't used */
-static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster)
+static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped)
 {
        struct r5conf *conf = mddev->private;
        struct stripe_head *sh;
@@ -5186,7 +5587,7 @@ static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
                        return handled;
                }
 
-               if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) {
+               if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) {
                        release_stripe(sh);
                        raid5_set_bi_processed_stripes(raid_bio, scnt);
                        conf->retry_read_aligned = raid_bio;
@@ -5699,7 +6100,8 @@ raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks)
 static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
 {
        safe_put_page(percpu->spare_page);
-       kfree(percpu->scribble);
+       if (percpu->scribble)
+               flex_array_free(percpu->scribble);
        percpu->spare_page = NULL;
        percpu->scribble = NULL;
 }
@@ -5709,7 +6111,9 @@ static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu
        if (conf->level == 6 && !percpu->spare_page)
                percpu->spare_page = alloc_page(GFP_KERNEL);
        if (!percpu->scribble)
-               percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL);
+               percpu->scribble = scribble_alloc(max(conf->raid_disks,
+                       conf->previous_raid_disks), conf->chunk_sectors /
+                       STRIPE_SECTORS, GFP_KERNEL);
 
        if (!percpu->scribble || (conf->level == 6 && !percpu->spare_page)) {
                free_scratch_buffer(conf, percpu);
@@ -5879,7 +6283,6 @@ static struct r5conf *setup_conf(struct mddev *mddev)
        else
                conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks;
        max_disks = max(conf->raid_disks, conf->previous_raid_disks);
-       conf->scribble_len = scribble_len(max_disks);
 
        conf->disks = kzalloc(max_disks * sizeof(struct disk_info),
                              GFP_KERNEL);
@@ -5907,6 +6310,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
                INIT_LIST_HEAD(conf->temp_inactive_list + i);
 
        conf->level = mddev->new_level;
+       conf->chunk_sectors = mddev->new_chunk_sectors;
        if (raid5_alloc_percpu(conf) != 0)
                goto abort;
 
@@ -5939,7 +6343,6 @@ static struct r5conf *setup_conf(struct mddev *mddev)
                        conf->fullsync = 1;
        }
 
-       conf->chunk_sectors = mddev->new_chunk_sectors;
        conf->level = mddev->new_level;
        if (conf->level == 6)
                conf->max_degraded = 2;