#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/nodemask.h>
+#include <linux/flex_array.h>
#include <trace/events/block.h>
#include "md.h"
BUG_ON(atomic_read(&sh->count) != 0);
BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
BUG_ON(stripe_operations_active(sh));
+ BUG_ON(sh->batch_head);
pr_debug("init_stripe called, stripe %llu\n",
(unsigned long long)sector);
}
if (read_seqcount_retry(&conf->gen_lock, seq))
goto retry;
+ sh->overwrite_disks = 0;
insert_hash(conf, sh);
sh->cpu = smp_processor_id();
+ set_bit(STRIPE_BATCH_READY, &sh->state);
}
static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
return sh;
}
+static bool is_full_stripe_write(struct stripe_head *sh)
+{
+ BUG_ON(sh->overwrite_disks > (sh->disks - sh->raid_conf->max_degraded));
+ return sh->overwrite_disks == (sh->disks - sh->raid_conf->max_degraded);
+}
+
+static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
+{
+ local_irq_disable();
+ if (sh1 > sh2) {
+ spin_lock(&sh2->stripe_lock);
+ spin_lock_nested(&sh1->stripe_lock, 1);
+ } else {
+ spin_lock(&sh1->stripe_lock);
+ spin_lock_nested(&sh2->stripe_lock, 1);
+ }
+}
+
+static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
+{
+ spin_unlock(&sh1->stripe_lock);
+ spin_unlock(&sh2->stripe_lock);
+ local_irq_enable();
+}
+
+/* Only freshly new full stripe normal write stripe can be added to a batch list */
+static bool stripe_can_batch(struct stripe_head *sh)
+{
+ return test_bit(STRIPE_BATCH_READY, &sh->state) &&
+ is_full_stripe_write(sh);
+}
+
+/* we only do back search */
+static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh)
+{
+ struct stripe_head *head;
+ sector_t head_sector, tmp_sec;
+ int hash;
+ int dd_idx;
+
+ if (!stripe_can_batch(sh))
+ return;
+ /* Don't cross chunks, so stripe pd_idx/qd_idx is the same */
+ tmp_sec = sh->sector;
+ if (!sector_div(tmp_sec, conf->chunk_sectors))
+ return;
+ head_sector = sh->sector - STRIPE_SECTORS;
+
+ hash = stripe_hash_locks_hash(head_sector);
+ spin_lock_irq(conf->hash_locks + hash);
+ head = __find_stripe(conf, head_sector, conf->generation);
+ if (head && !atomic_inc_not_zero(&head->count)) {
+ spin_lock(&conf->device_lock);
+ if (!atomic_read(&head->count)) {
+ if (!test_bit(STRIPE_HANDLE, &head->state))
+ atomic_inc(&conf->active_stripes);
+ BUG_ON(list_empty(&head->lru) &&
+ !test_bit(STRIPE_EXPANDING, &head->state));
+ list_del_init(&head->lru);
+ if (head->group) {
+ head->group->stripes_cnt--;
+ head->group = NULL;
+ }
+ }
+ atomic_inc(&head->count);
+ spin_unlock(&conf->device_lock);
+ }
+ spin_unlock_irq(conf->hash_locks + hash);
+
+ if (!head)
+ return;
+ if (!stripe_can_batch(head))
+ goto out;
+
+ lock_two_stripes(head, sh);
+ /* clear_batch_ready clear the flag */
+ if (!stripe_can_batch(head) || !stripe_can_batch(sh))
+ goto unlock_out;
+
+ if (sh->batch_head)
+ goto unlock_out;
+
+ dd_idx = 0;
+ while (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx)
+ dd_idx++;
+ if (head->dev[dd_idx].towrite->bi_rw != sh->dev[dd_idx].towrite->bi_rw)
+ goto unlock_out;
+
+ if (head->batch_head) {
+ spin_lock(&head->batch_head->batch_lock);
+ /* This batch list is already running */
+ if (!stripe_can_batch(head)) {
+ spin_unlock(&head->batch_head->batch_lock);
+ goto unlock_out;
+ }
+
+ /*
+ * at this point, head's BATCH_READY could be cleared, but we
+ * can still add the stripe to batch list
+ */
+ list_add(&sh->batch_list, &head->batch_list);
+ spin_unlock(&head->batch_head->batch_lock);
+
+ sh->batch_head = head->batch_head;
+ } else {
+ head->batch_head = head;
+ sh->batch_head = head->batch_head;
+ spin_lock(&head->batch_lock);
+ list_add_tail(&sh->batch_list, &head->batch_list);
+ spin_unlock(&head->batch_lock);
+ }
+
+ if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+ if (atomic_dec_return(&conf->preread_active_stripes)
+ < IO_THRESHOLD)
+ md_wakeup_thread(conf->mddev->thread);
+
+ atomic_inc(&sh->count);
+unlock_out:
+ unlock_two_stripes(head, sh);
+out:
+ release_stripe(head);
+}
+
/* Determine if 'data_offset' or 'new_data_offset' should be used
* in this stripe_head.
*/
{
struct r5conf *conf = sh->raid_conf;
int i, disks = sh->disks;
+ struct stripe_head *head_sh = sh;
might_sleep();
int replace_only = 0;
struct bio *bi, *rbi;
struct md_rdev *rdev, *rrdev = NULL;
+
+ sh = head_sh;
if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
rw = WRITE_FUA;
if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags))
rw |= REQ_SYNC;
+again:
bi = &sh->dev[i].req;
rbi = &sh->dev[i].rreq; /* For writing to replacement */
/* We raced and saw duplicates */
rrdev = NULL;
} else {
- if (test_bit(R5_ReadRepl, &sh->dev[i].flags) && rrdev)
+ if (test_bit(R5_ReadRepl, &head_sh->dev[i].flags) && rrdev)
rdev = rrdev;
rrdev = NULL;
}
__func__, (unsigned long long)sh->sector,
bi->bi_rw, i);
atomic_inc(&sh->count);
+ if (sh != head_sh)
+ atomic_inc(&head_sh->count);
if (use_new_offset(conf, sh))
bi->bi_iter.bi_sector = (sh->sector
+ rdev->new_data_offset);
else
bi->bi_iter.bi_sector = (sh->sector
+ rdev->data_offset);
- if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
+ if (test_bit(R5_ReadNoMerge, &head_sh->dev[i].flags))
bi->bi_rw |= REQ_NOMERGE;
if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
__func__, (unsigned long long)sh->sector,
rbi->bi_rw, i);
atomic_inc(&sh->count);
+ if (sh != head_sh)
+ atomic_inc(&head_sh->count);
if (use_new_offset(conf, sh))
rbi->bi_iter.bi_sector = (sh->sector
+ rrdev->new_data_offset);
pr_debug("skip op %ld on disc %d for sector %llu\n",
bi->bi_rw, i, (unsigned long long)sh->sector);
clear_bit(R5_LOCKED, &sh->dev[i].flags);
+ if (sh->batch_head)
+ set_bit(STRIPE_BATCH_ERR,
+ &sh->batch_head->state);
set_bit(STRIPE_HANDLE, &sh->state);
}
+
+ if (!head_sh->batch_head)
+ continue;
+ sh = list_first_entry(&sh->batch_list, struct stripe_head,
+ batch_list);
+ if (sh != head_sh)
+ goto again;
}
}
struct async_submit_ctl submit;
int i;
+ BUG_ON(sh->batch_head);
pr_debug("%s: stripe %llu\n", __func__,
(unsigned long long)sh->sector);
/* return a pointer to the address conversion region of the scribble buffer */
static addr_conv_t *to_addr_conv(struct stripe_head *sh,
- struct raid5_percpu *percpu)
+ struct raid5_percpu *percpu, int i)
+{
+ void *addr;
+
+ addr = flex_array_get(percpu->scribble, i);
+ return addr + sizeof(struct page *) * (sh->disks + 2);
+}
+
+/* return a pointer to the address conversion region of the scribble buffer */
+static struct page **to_addr_page(struct raid5_percpu *percpu, int i)
{
- return percpu->scribble + sizeof(struct page *) * (sh->disks + 2);
+ void *addr;
+
+ addr = flex_array_get(percpu->scribble, i);
+ return addr;
}
static struct dma_async_tx_descriptor *
ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
{
int disks = sh->disks;
- struct page **xor_srcs = percpu->scribble;
+ struct page **xor_srcs = to_addr_page(percpu, 0);
int target = sh->ops.target;
struct r5dev *tgt = &sh->dev[target];
struct page *xor_dest = tgt->page;
struct async_submit_ctl submit;
int i;
+ BUG_ON(sh->batch_head);
+
pr_debug("%s: stripe %llu block: %d\n",
__func__, (unsigned long long)sh->sector, target);
BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
atomic_inc(&sh->count);
init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
- ops_complete_compute, sh, to_addr_conv(sh, percpu));
+ ops_complete_compute, sh, to_addr_conv(sh, percpu, 0));
if (unlikely(count == 1))
tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
else
ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
{
int disks = sh->disks;
- struct page **blocks = percpu->scribble;
+ struct page **blocks = to_addr_page(percpu, 0);
int target;
int qd_idx = sh->qd_idx;
struct dma_async_tx_descriptor *tx;
int i;
int count;
+ BUG_ON(sh->batch_head);
if (sh->ops.target < 0)
target = sh->ops.target2;
else if (sh->ops.target2 < 0)
BUG_ON(blocks[count+1] != dest); /* q should already be set */
init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
ops_complete_compute, sh,
- to_addr_conv(sh, percpu));
+ to_addr_conv(sh, percpu, 0));
tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
} else {
/* Compute any data- or p-drive using XOR */
init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
NULL, ops_complete_compute, sh,
- to_addr_conv(sh, percpu));
+ to_addr_conv(sh, percpu, 0));
tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit);
}
struct r5dev *tgt = &sh->dev[target];
struct r5dev *tgt2 = &sh->dev[target2];
struct dma_async_tx_descriptor *tx;
- struct page **blocks = percpu->scribble;
+ struct page **blocks = to_addr_page(percpu, 0);
struct async_submit_ctl submit;
+ BUG_ON(sh->batch_head);
pr_debug("%s: stripe %llu block1: %d block2: %d\n",
__func__, (unsigned long long)sh->sector, target, target2);
BUG_ON(target < 0 || target2 < 0);
/* Missing P+Q, just recompute */
init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
ops_complete_compute, sh,
- to_addr_conv(sh, percpu));
+ to_addr_conv(sh, percpu, 0));
return async_gen_syndrome(blocks, 0, syndrome_disks+2,
STRIPE_SIZE, &submit);
} else {
init_async_submit(&submit,
ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
NULL, NULL, NULL,
- to_addr_conv(sh, percpu));
+ to_addr_conv(sh, percpu, 0));
tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
&submit);
count = set_syndrome_sources(blocks, sh);
init_async_submit(&submit, ASYNC_TX_FENCE, tx,
ops_complete_compute, sh,
- to_addr_conv(sh, percpu));
+ to_addr_conv(sh, percpu, 0));
return async_gen_syndrome(blocks, 0, count+2,
STRIPE_SIZE, &submit);
}
} else {
init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
ops_complete_compute, sh,
- to_addr_conv(sh, percpu));
+ to_addr_conv(sh, percpu, 0));
if (failb == syndrome_disks) {
/* We're missing D+P. */
return async_raid6_datap_recov(syndrome_disks+2,
struct dma_async_tx_descriptor *tx)
{
int disks = sh->disks;
- struct page **xor_srcs = percpu->scribble;
+ struct page **xor_srcs = to_addr_page(percpu, 0);
int count = 0, pd_idx = sh->pd_idx, i;
struct async_submit_ctl submit;
/* existing parity data subtracted */
struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
+ BUG_ON(sh->batch_head);
pr_debug("%s: stripe %llu\n", __func__,
(unsigned long long)sh->sector);
}
init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
- ops_complete_prexor, sh, to_addr_conv(sh, percpu));
+ ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
return tx;
{
int disks = sh->disks;
int i;
+ struct stripe_head *head_sh = sh;
pr_debug("%s: stripe %llu\n", __func__,
(unsigned long long)sh->sector);
for (i = disks; i--; ) {
- struct r5dev *dev = &sh->dev[i];
+ struct r5dev *dev;
struct bio *chosen;
- if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) {
+ sh = head_sh;
+ if (test_and_clear_bit(R5_Wantdrain, &head_sh->dev[i].flags)) {
struct bio *wbi;
+again:
+ dev = &sh->dev[i];
spin_lock_irq(&sh->stripe_lock);
chosen = dev->towrite;
dev->towrite = NULL;
+ sh->overwrite_disks = 0;
BUG_ON(dev->written);
wbi = dev->written = chosen;
spin_unlock_irq(&sh->stripe_lock);
}
wbi = r5_next_bio(wbi, dev->sector);
}
+
+ if (head_sh->batch_head) {
+ sh = list_first_entry(&sh->batch_list,
+ struct stripe_head,
+ batch_list);
+ if (sh == head_sh)
+ continue;
+ goto again;
+ }
}
}
struct dma_async_tx_descriptor *tx)
{
int disks = sh->disks;
- struct page **xor_srcs = percpu->scribble;
+ struct page **xor_srcs;
struct async_submit_ctl submit;
- int count = 0, pd_idx = sh->pd_idx, i;
+ int count, pd_idx = sh->pd_idx, i;
struct page *xor_dest;
int prexor = 0;
unsigned long flags;
+ int j = 0;
+ struct stripe_head *head_sh = sh;
+ int last_stripe;
pr_debug("%s: stripe %llu\n", __func__,
(unsigned long long)sh->sector);
ops_complete_reconstruct(sh);
return;
}
+again:
+ count = 0;
+ xor_srcs = to_addr_page(percpu, j);
/* check if prexor is active which means only process blocks
* that are part of a read-modify-write (written)
*/
- if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
+ if (head_sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
prexor = 1;
xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
- if (dev->written)
+ if (head_sh->dev[i].written)
xor_srcs[count++] = dev->page;
}
} else {
* set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
* for the synchronous xor case
*/
- flags = ASYNC_TX_ACK |
- (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
-
- atomic_inc(&sh->count);
+ last_stripe = !head_sh->batch_head ||
+ list_first_entry(&sh->batch_list,
+ struct stripe_head, batch_list) == head_sh;
+ if (last_stripe) {
+ flags = ASYNC_TX_ACK |
+ (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
+
+ atomic_inc(&head_sh->count);
+ init_async_submit(&submit, flags, tx, ops_complete_reconstruct, head_sh,
+ to_addr_conv(sh, percpu, j));
+ } else {
+ flags = prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST;
+ init_async_submit(&submit, flags, tx, NULL, NULL,
+ to_addr_conv(sh, percpu, j));
+ }
- init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh,
- to_addr_conv(sh, percpu));
if (unlikely(count == 1))
tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
else
tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
+ if (!last_stripe) {
+ j++;
+ sh = list_first_entry(&sh->batch_list, struct stripe_head,
+ batch_list);
+ goto again;
+ }
}
static void
struct dma_async_tx_descriptor *tx)
{
struct async_submit_ctl submit;
- struct page **blocks = percpu->scribble;
- int count, i;
+ struct page **blocks;
+ int count, i, j = 0;
+ struct stripe_head *head_sh = sh;
+ int last_stripe;
pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
return;
}
+again:
+ blocks = to_addr_page(percpu, j);
count = set_syndrome_sources(blocks, sh);
-
- atomic_inc(&sh->count);
-
- init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct,
- sh, to_addr_conv(sh, percpu));
+ last_stripe = !head_sh->batch_head ||
+ list_first_entry(&sh->batch_list,
+ struct stripe_head, batch_list) == head_sh;
+
+ if (last_stripe) {
+ atomic_inc(&head_sh->count);
+ init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct,
+ head_sh, to_addr_conv(sh, percpu, j));
+ } else
+ init_async_submit(&submit, 0, tx, NULL, NULL,
+ to_addr_conv(sh, percpu, j));
async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
+ if (!last_stripe) {
+ j++;
+ sh = list_first_entry(&sh->batch_list, struct stripe_head,
+ batch_list);
+ goto again;
+ }
}
static void ops_complete_check(void *stripe_head_ref)
int pd_idx = sh->pd_idx;
int qd_idx = sh->qd_idx;
struct page *xor_dest;
- struct page **xor_srcs = percpu->scribble;
+ struct page **xor_srcs = to_addr_page(percpu, 0);
struct dma_async_tx_descriptor *tx;
struct async_submit_ctl submit;
int count;
pr_debug("%s: stripe %llu\n", __func__,
(unsigned long long)sh->sector);
+ BUG_ON(sh->batch_head);
count = 0;
xor_dest = sh->dev[pd_idx].page;
xor_srcs[count++] = xor_dest;
}
init_async_submit(&submit, 0, NULL, NULL, NULL,
- to_addr_conv(sh, percpu));
+ to_addr_conv(sh, percpu, 0));
tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
&sh->ops.zero_sum_result, &submit);
static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp)
{
- struct page **srcs = percpu->scribble;
+ struct page **srcs = to_addr_page(percpu, 0);
struct async_submit_ctl submit;
int count;
pr_debug("%s: stripe %llu checkp: %d\n", __func__,
(unsigned long long)sh->sector, checkp);
+ BUG_ON(sh->batch_head);
count = set_syndrome_sources(srcs, sh);
if (!checkp)
srcs[count] = NULL;
atomic_inc(&sh->count);
init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
- sh, to_addr_conv(sh, percpu));
+ sh, to_addr_conv(sh, percpu, 0));
async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE,
&sh->ops.zero_sum_result, percpu->spare_page, &submit);
}
BUG();
}
- if (overlap_clear)
+ if (overlap_clear && !sh->batch_head)
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
if (test_and_clear_bit(R5_Overlap, &dev->flags))
atomic_set(&sh->count, 1);
atomic_inc(&conf->active_stripes);
INIT_LIST_HEAD(&sh->lru);
+
+ spin_lock_init(&sh->batch_lock);
+ INIT_LIST_HEAD(&sh->batch_list);
+ sh->batch_head = NULL;
release_stripe(sh);
return 1;
}
* calculate over all devices (not just the data blocks), using zeros in place
* of the P and Q blocks.
*/
-static size_t scribble_len(int num)
+static struct flex_array *scribble_alloc(int num, int cnt, gfp_t flags)
{
+ struct flex_array *ret;
size_t len;
len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2);
-
- return len;
+ ret = flex_array_alloc(len, cnt, flags);
+ if (!ret)
+ return NULL;
+ /* always prealloc all elements, so no locking is required */
+ if (flex_array_prealloc(ret, 0, cnt, flags)) {
+ flex_array_free(ret);
+ return NULL;
+ }
+ return ret;
}
static int resize_stripes(struct r5conf *conf, int newsize)
err = -ENOMEM;
get_online_cpus();
- conf->scribble_len = scribble_len(newsize);
for_each_present_cpu(cpu) {
struct raid5_percpu *percpu;
- void *scribble;
+ struct flex_array *scribble;
percpu = per_cpu_ptr(conf->percpu, cpu);
- scribble = kmalloc(conf->scribble_len, GFP_NOIO);
+ scribble = scribble_alloc(newsize, conf->chunk_sectors /
+ STRIPE_SECTORS, GFP_NOIO);
if (scribble) {
- kfree(percpu->scribble);
+ flex_array_free(percpu->scribble);
percpu->scribble = scribble;
} else {
err = -ENOMEM;
}
rdev_dec_pending(rdev, conf->mddev);
+ if (sh->batch_head && !uptodate)
+ set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state);
+
if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
clear_bit(R5_LOCKED, &sh->dev[i].flags);
set_bit(STRIPE_HANDLE, &sh->state);
release_stripe(sh);
+
+ if (sh->batch_head && sh != sh->batch_head)
+ release_stripe(sh->batch_head);
}
static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous);
* toread/towrite point to the first in a chain.
* The bi_next chain must be in order.
*/
-static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
+static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
+ int forwrite, int previous)
{
struct bio **bip;
struct r5conf *conf = sh->raid_conf;
* protect it.
*/
spin_lock_irq(&sh->stripe_lock);
+ /* Don't allow new IO added to stripes in batch list */
+ if (sh->batch_head)
+ goto overlap;
if (forwrite) {
bip = &sh->dev[dd_idx].towrite;
if (*bip == NULL)
if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi))
goto overlap;
+ if (!forwrite || previous)
+ clear_bit(STRIPE_BATCH_READY, &sh->state);
+
BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next);
if (*bip)
bi->bi_next = *bip;
sector = bio_end_sector(bi);
}
if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
- set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
+ if (!test_and_set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags))
+ sh->overwrite_disks++;
}
pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
sh->bm_seq = conf->seq_flush+1;
set_bit(STRIPE_BIT_DELAY, &sh->state);
}
+
+ if (stripe_can_batch(sh))
+ stripe_add_to_batch_list(conf, sh);
return 1;
overlap:
struct bio **return_bi)
{
int i;
+ BUG_ON(sh->batch_head);
for (i = disks; i--; ) {
struct bio *bi;
int bitmap_end = 0;
/* fail all writes first */
bi = sh->dev[i].towrite;
sh->dev[i].towrite = NULL;
+ sh->overwrite_disks = 0;
spin_unlock_irq(&sh->stripe_lock);
if (bi)
bitmap_end = 1;
int abort = 0;
int i;
+ BUG_ON(sh->batch_head);
clear_bit(STRIPE_SYNCING, &sh->state);
if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
wake_up(&conf->wait_for_overlap);
{
int i;
+ BUG_ON(sh->batch_head);
/* look for blocks to read/compute, skip this if a compute
* is already in flight, or if the stripe contents are in the
* midst of changing due to a write
int i;
struct r5dev *dev;
int discard_pending = 0;
+ struct stripe_head *head_sh = sh;
+ bool do_endio = false;
+ int wakeup_nr = 0;
for (i = disks; i--; )
if (sh->dev[i].written) {
clear_bit(R5_UPTODATE, &dev->flags);
if (test_and_clear_bit(R5_SkipCopy, &dev->flags)) {
WARN_ON(test_bit(R5_UPTODATE, &dev->flags));
- dev->page = dev->orig_page;
}
+ do_endio = true;
+
+returnbi:
+ dev->page = dev->orig_page;
wbi = dev->written;
dev->written = NULL;
while (wbi && wbi->bi_iter.bi_sector <
STRIPE_SECTORS,
!test_bit(STRIPE_DEGRADED, &sh->state),
0);
+ if (head_sh->batch_head) {
+ sh = list_first_entry(&sh->batch_list,
+ struct stripe_head,
+ batch_list);
+ if (sh != head_sh) {
+ dev = &sh->dev[i];
+ goto returnbi;
+ }
+ }
+ sh = head_sh;
+ dev = &sh->dev[i];
} else if (test_bit(R5_Discard, &dev->flags))
discard_pending = 1;
WARN_ON(test_bit(R5_SkipCopy, &dev->flags));
* will be reinitialized
*/
spin_lock_irq(&conf->device_lock);
+unhash:
remove_hash(sh);
+ if (head_sh->batch_head) {
+ sh = list_first_entry(&sh->batch_list,
+ struct stripe_head, batch_list);
+ if (sh != head_sh)
+ goto unhash;
+ }
spin_unlock_irq(&conf->device_lock);
+ sh = head_sh;
+
if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))
set_bit(STRIPE_HANDLE, &sh->state);
if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
if (atomic_dec_and_test(&conf->pending_full_writes))
md_wakeup_thread(conf->mddev->thread);
+
+ if (!head_sh->batch_head || !do_endio)
+ return;
+ for (i = 0; i < head_sh->disks; i++) {
+ if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags))
+ wakeup_nr++;
+ }
+ while (!list_empty(&head_sh->batch_list)) {
+ int i;
+ sh = list_first_entry(&head_sh->batch_list,
+ struct stripe_head, batch_list);
+ list_del_init(&sh->batch_list);
+
+ set_mask_bits(&sh->state, ~STRIPE_EXPAND_SYNC_FLAG,
+ head_sh->state & ~((1 << STRIPE_ACTIVE) |
+ (1 << STRIPE_PREREAD_ACTIVE) |
+ STRIPE_EXPAND_SYNC_FLAG));
+ sh->check_state = head_sh->check_state;
+ sh->reconstruct_state = head_sh->reconstruct_state;
+ for (i = 0; i < sh->disks; i++) {
+ if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
+ wakeup_nr++;
+ sh->dev[i].flags = head_sh->dev[i].flags;
+ }
+
+ spin_lock_irq(&sh->stripe_lock);
+ sh->batch_head = NULL;
+ spin_unlock_irq(&sh->stripe_lock);
+ if (sh->state & STRIPE_EXPAND_SYNC_FLAG)
+ set_bit(STRIPE_HANDLE, &sh->state);
+ release_stripe(sh);
+ }
+
+ spin_lock_irq(&head_sh->stripe_lock);
+ head_sh->batch_head = NULL;
+ spin_unlock_irq(&head_sh->stripe_lock);
+ wake_up_nr(&conf->wait_for_overlap, wakeup_nr);
+ if (head_sh->state & STRIPE_EXPAND_SYNC_FLAG)
+ set_bit(STRIPE_HANDLE, &head_sh->state);
}
static void handle_stripe_dirtying(struct r5conf *conf,
{
struct r5dev *dev = NULL;
+ BUG_ON(sh->batch_head);
set_bit(STRIPE_HANDLE, &sh->state);
switch (sh->check_state) {
int qd_idx = sh->qd_idx;
struct r5dev *dev;
+ BUG_ON(sh->batch_head);
set_bit(STRIPE_HANDLE, &sh->state);
BUG_ON(s->failed > 2);
* copy some of them into a target stripe for expand.
*/
struct dma_async_tx_descriptor *tx = NULL;
+ BUG_ON(sh->batch_head);
clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
for (i = 0; i < sh->disks; i++)
if (i != sh->pd_idx && i != sh->qd_idx) {
memset(s, 0, sizeof(*s));
- s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
- s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
+ s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state) && !sh->batch_head;
+ s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state) && !sh->batch_head;
s->failed_num[0] = -1;
s->failed_num[1] = -1;
rcu_read_unlock();
}
+static int clear_batch_ready(struct stripe_head *sh)
+{
+ struct stripe_head *tmp;
+ if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state))
+ return 0;
+ spin_lock(&sh->stripe_lock);
+ if (!sh->batch_head) {
+ spin_unlock(&sh->stripe_lock);
+ return 0;
+ }
+
+ /*
+ * this stripe could be added to a batch list before we check
+ * BATCH_READY, skips it
+ */
+ if (sh->batch_head != sh) {
+ spin_unlock(&sh->stripe_lock);
+ return 1;
+ }
+ spin_lock(&sh->batch_lock);
+ list_for_each_entry(tmp, &sh->batch_list, batch_list)
+ clear_bit(STRIPE_BATCH_READY, &tmp->state);
+ spin_unlock(&sh->batch_lock);
+ spin_unlock(&sh->stripe_lock);
+
+ /*
+ * BATCH_READY is cleared, no new stripes can be added.
+ * batch_list can be accessed without lock
+ */
+ return 0;
+}
+
+static void check_break_stripe_batch_list(struct stripe_head *sh)
+{
+ struct stripe_head *head_sh, *next;
+ int i;
+
+ if (!test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state))
+ return;
+
+ head_sh = sh;
+ do {
+ sh = list_first_entry(&sh->batch_list,
+ struct stripe_head, batch_list);
+ BUG_ON(sh == head_sh);
+ } while (!test_bit(STRIPE_DEGRADED, &sh->state));
+
+ while (sh != head_sh) {
+ next = list_first_entry(&sh->batch_list,
+ struct stripe_head, batch_list);
+ list_del_init(&sh->batch_list);
+
+ set_mask_bits(&sh->state, ~STRIPE_EXPAND_SYNC_FLAG,
+ head_sh->state & ~((1 << STRIPE_ACTIVE) |
+ (1 << STRIPE_PREREAD_ACTIVE) |
+ (1 << STRIPE_DEGRADED) |
+ STRIPE_EXPAND_SYNC_FLAG));
+ sh->check_state = head_sh->check_state;
+ sh->reconstruct_state = head_sh->reconstruct_state;
+ for (i = 0; i < sh->disks; i++)
+ sh->dev[i].flags = head_sh->dev[i].flags &
+ (~((1 << R5_WriteError) | (1 << R5_Overlap)));
+
+ spin_lock_irq(&sh->stripe_lock);
+ sh->batch_head = NULL;
+ spin_unlock_irq(&sh->stripe_lock);
+
+ set_bit(STRIPE_HANDLE, &sh->state);
+ release_stripe(sh);
+
+ sh = next;
+ }
+}
+
static void handle_stripe(struct stripe_head *sh)
{
struct stripe_head_state s;
return;
}
- if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
+ if (clear_batch_ready(sh) ) {
+ clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
+ return;
+ }
+
+ check_break_stripe_batch_list(sh);
+
+ if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) {
spin_lock(&sh->stripe_lock);
/* Cannot process 'sync' concurrently with 'discard' */
if (!test_bit(STRIPE_DISCARD, &sh->state) &&
}
set_bit(STRIPE_DISCARD, &sh->state);
finish_wait(&conf->wait_for_overlap, &w);
+ sh->overwrite_disks = 0;
for (d = 0; d < conf->raid_disks; d++) {
if (d == sh->pd_idx || d == sh->qd_idx)
continue;
sh->dev[d].towrite = bi;
set_bit(R5_OVERWRITE, &sh->dev[d].flags);
raid5_inc_bi_active_stripes(bi);
+ sh->overwrite_disks++;
}
spin_unlock_irq(&sh->stripe_lock);
if (conf->mddev->bitmap) {
}
if (test_bit(STRIPE_EXPANDING, &sh->state) ||
- !add_stripe_bio(sh, bi, dd_idx, rw)) {
+ !add_stripe_bio(sh, bi, dd_idx, rw, previous)) {
/* Stripe is busy expanding or
* add failed due to overlap. Flush everything
* and wait a while
}
set_bit(STRIPE_HANDLE, &sh->state);
clear_bit(STRIPE_DELAYED, &sh->state);
- if ((bi->bi_rw & REQ_SYNC) &&
+ if ((!sh->batch_head || sh == sh->batch_head) &&
+ (bi->bi_rw & REQ_SYNC) &&
!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
atomic_inc(&conf->preread_active_stripes);
release_stripe_plug(mddev, sh);
return reshape_sectors;
}
-/* FIXME go_faster isn't used */
-static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster)
+static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped)
{
struct r5conf *conf = mddev->private;
struct stripe_head *sh;
return handled;
}
- if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) {
+ if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) {
release_stripe(sh);
raid5_set_bi_processed_stripes(raid_bio, scnt);
conf->retry_read_aligned = raid_bio;
static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
{
safe_put_page(percpu->spare_page);
- kfree(percpu->scribble);
+ if (percpu->scribble)
+ flex_array_free(percpu->scribble);
percpu->spare_page = NULL;
percpu->scribble = NULL;
}
if (conf->level == 6 && !percpu->spare_page)
percpu->spare_page = alloc_page(GFP_KERNEL);
if (!percpu->scribble)
- percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL);
+ percpu->scribble = scribble_alloc(max(conf->raid_disks,
+ conf->previous_raid_disks), conf->chunk_sectors /
+ STRIPE_SECTORS, GFP_KERNEL);
if (!percpu->scribble || (conf->level == 6 && !percpu->spare_page)) {
free_scratch_buffer(conf, percpu);
else
conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks;
max_disks = max(conf->raid_disks, conf->previous_raid_disks);
- conf->scribble_len = scribble_len(max_disks);
conf->disks = kzalloc(max_disks * sizeof(struct disk_info),
GFP_KERNEL);
INIT_LIST_HEAD(conf->temp_inactive_list + i);
conf->level = mddev->new_level;
+ conf->chunk_sectors = mddev->new_chunk_sectors;
if (raid5_alloc_percpu(conf) != 0)
goto abort;
conf->fullsync = 1;
}
- conf->chunk_sectors = mddev->new_chunk_sectors;
conf->level = mddev->new_level;
if (conf->level == 6)
conf->max_degraded = 2;