*/
#define BTRFS_MAX_MIRRORS (4 + 1)
+/* Represent one sector and its needed info to verify the content. */
+struct scrub_sector_verification {
+ bool is_metadata;
+
+ union {
+ /*
+ * Csum pointer for data csum verification. Should point to a
+ * sector csum inside scrub_stripe::csums.
+ *
+ * NULL if this data sector has no csum.
+ */
+ u8 *csum;
+
+ /*
+ * Extra info for metadata verification. All sectors inside a
+ * tree block share the same generation.
+ */
+ u64 generation;
+ };
+};
+
+enum scrub_stripe_flags {
+ /* Set when @mirror_num, @dev, @physical and @logical are set. */
+ SCRUB_STRIPE_FLAG_INITIALIZED,
+
+ /* Set when the read-repair is finished. */
+ SCRUB_STRIPE_FLAG_REPAIR_DONE,
+};
+
+#define SCRUB_STRIPE_PAGES (BTRFS_STRIPE_LEN / PAGE_SIZE)
+
+/*
+ * Represent one contiguous range with a length of BTRFS_STRIPE_LEN.
+ */
+struct scrub_stripe {
+ struct btrfs_block_group *bg;
+
+ struct page *pages[SCRUB_STRIPE_PAGES];
+ struct scrub_sector_verification *sectors;
+
+ struct btrfs_device *dev;
+ u64 logical;
+ u64 physical;
+
+ u16 mirror_num;
+
+ /* Should be BTRFS_STRIPE_LEN / sectorsize. */
+ u16 nr_sectors;
+
+ atomic_t pending_io;
+ wait_queue_head_t io_wait;
+
+ /*
+ * Indicate the states of the stripe. Bits are defined in
+ * scrub_stripe_flags enum.
+ */
+ unsigned long state;
+
+ /* Indicate which sectors are covered by extent items. */
+ unsigned long extent_sector_bitmap;
+
+ /*
+ * The errors hit during the initial read of the stripe.
+ *
+ * Would be utilized for error reporting and repair.
+ */
+ unsigned long init_error_bitmap;
+
+ /*
+ * The following error bitmaps are all for the current status.
+ * Every time we submit a new read, these bitmaps may be updated.
+ *
+ * error_bitmap = io_error_bitmap | csum_error_bitmap | meta_error_bitmap;
+ *
+ * IO and csum errors can happen for both metadata and data.
+ */
+ unsigned long error_bitmap;
+ unsigned long io_error_bitmap;
+ unsigned long csum_error_bitmap;
+ unsigned long meta_error_bitmap;
+
+ /*
+ * Checksum for the whole stripe if this stripe is inside a data block
+ * group.
+ */
+ u8 *csums;
+};
+
struct scrub_recover {
refcount_t refs;
struct btrfs_io_context *bioc;
#endif
}
+static void release_scrub_stripe(struct scrub_stripe *stripe)
+{
+ if (!stripe)
+ return;
+
+ for (int i = 0; i < SCRUB_STRIPE_PAGES; i++) {
+ if (stripe->pages[i])
+ __free_page(stripe->pages[i]);
+ stripe->pages[i] = NULL;
+ }
+ kfree(stripe->sectors);
+ kfree(stripe->csums);
+ stripe->sectors = NULL;
+ stripe->csums = NULL;
+ stripe->state = 0;
+}
+
+int init_scrub_stripe(struct btrfs_fs_info *fs_info, struct scrub_stripe *stripe)
+{
+ int ret;
+
+ memset(stripe, 0, sizeof(*stripe));
+
+ stripe->nr_sectors = BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits;
+ stripe->state = 0;
+
+ init_waitqueue_head(&stripe->io_wait);
+ atomic_set(&stripe->pending_io, 0);
+
+ ret = btrfs_alloc_page_array(SCRUB_STRIPE_PAGES, stripe->pages);
+ if (ret < 0)
+ goto error;
+
+ stripe->sectors = kcalloc(stripe->nr_sectors,
+ sizeof(struct scrub_sector_verification),
+ GFP_KERNEL);
+ if (!stripe->sectors)
+ goto error;
+
+ stripe->csums = kcalloc(BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits,
+ fs_info->csum_size, GFP_KERNEL);
+ if (!stripe->csums)
+ goto error;
+ return 0;
+error:
+ release_scrub_stripe(stripe);
+ return -ENOMEM;
+}
+
+void wait_scrub_stripe_io(struct scrub_stripe *stripe)
+{
+ wait_event(stripe->io_wait, atomic_read(&stripe->pending_io) == 0);
+}
+
static struct scrub_block *alloc_scrub_block(struct scrub_ctx *sctx,
struct btrfs_device *dev,
u64 logical, u64 physical,
static void scrub_bio_end_io(struct bio *bio);
static void scrub_bio_end_io_worker(struct work_struct *work);
static void scrub_block_complete(struct scrub_block *sblock);
-static void scrub_find_good_copy(struct btrfs_fs_info *fs_info,
- u64 extent_logical, u32 extent_len,
- u64 *extent_physical,
- struct btrfs_device **extent_dev,
- int *extent_mirror_num);
static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx,
struct scrub_sector *sector);
static void scrub_wr_submit(struct scrub_ctx *sctx);
sblock_other = sblocks_for_recheck[mirror_index];
} else {
struct scrub_recover *r = sblock_bad->sectors[0]->recover;
- int max_allowed = r->bioc->num_stripes - r->bioc->num_tgtdevs;
+ int max_allowed = r->bioc->num_stripes - r->bioc->replace_nr_stripes;
if (mirror_index >= max_allowed)
break;
}
static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
- u64 *raid_map,
+ u64 full_stripe_logical,
int nstripes, int mirror,
int *stripe_index,
u64 *stripe_offset)
int i;
if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+ const int nr_data_stripes = (map_type & BTRFS_BLOCK_GROUP_RAID5) ?
+ nstripes - 1 : nstripes - 2;
+
/* RAID5/6 */
- for (i = 0; i < nstripes; i++) {
- if (raid_map[i] == RAID6_Q_STRIPE ||
- raid_map[i] == RAID5_P_STRIPE)
- continue;
+ for (i = 0; i < nr_data_stripes; i++) {
+ const u64 data_stripe_start = full_stripe_logical +
+ (i * BTRFS_STRIPE_LEN);
- if (logical >= raid_map[i] &&
- logical < raid_map[i] + BTRFS_STRIPE_LEN)
+ if (logical >= data_stripe_start &&
+ logical < data_stripe_start + BTRFS_STRIPE_LEN)
break;
}
*stripe_index = i;
- *stripe_offset = logical - raid_map[i];
+ *stripe_offset = (logical - full_stripe_logical) &
+ BTRFS_STRIPE_LEN_MASK;
} else {
/* The other RAID type */
*stripe_index = mirror;
scrub_stripe_index_and_offset(logical,
bioc->map_type,
- bioc->raid_map,
+ bioc->full_stripe_logical,
bioc->num_stripes -
- bioc->num_tgtdevs,
+ bioc->replace_nr_stripes,
mirror_index,
&stripe_index,
&stripe_offset);
return sblock->checksum_error;
}
+static struct page *scrub_stripe_get_page(struct scrub_stripe *stripe, int sector_nr)
+{
+ struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
+ int page_index = (sector_nr << fs_info->sectorsize_bits) >> PAGE_SHIFT;
+
+ return stripe->pages[page_index];
+}
+
+static unsigned int scrub_stripe_get_page_offset(struct scrub_stripe *stripe,
+ int sector_nr)
+{
+ struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
+
+ return offset_in_page(sector_nr << fs_info->sectorsize_bits);
+}
+
+void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr)
+{
+ struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
+ const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits;
+ const u64 logical = stripe->logical + (sector_nr << fs_info->sectorsize_bits);
+ const struct page *first_page = scrub_stripe_get_page(stripe, sector_nr);
+ const unsigned int first_off = scrub_stripe_get_page_offset(stripe, sector_nr);
+ SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
+ u8 on_disk_csum[BTRFS_CSUM_SIZE];
+ u8 calculated_csum[BTRFS_CSUM_SIZE];
+ struct btrfs_header *header;
+
+ /*
+ * Here we don't have a good way to attach the pages (and subpages)
+ * to a dummy extent buffer, thus we have to directly grab the members
+ * from pages.
+ */
+ header = (struct btrfs_header *)(page_address(first_page) + first_off);
+ memcpy(on_disk_csum, header->csum, fs_info->csum_size);
+
+ if (logical != btrfs_stack_header_bytenr(header)) {
+ bitmap_set(&stripe->csum_error_bitmap, sector_nr, sectors_per_tree);
+ bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
+ btrfs_warn_rl(fs_info,
+ "tree block %llu mirror %u has bad bytenr, has %llu want %llu",
+ logical, stripe->mirror_num,
+ btrfs_stack_header_bytenr(header), logical);
+ return;
+ }
+ if (memcmp(header->fsid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE) != 0) {
+ bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
+ bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
+ btrfs_warn_rl(fs_info,
+ "tree block %llu mirror %u has bad fsid, has %pU want %pU",
+ logical, stripe->mirror_num,
+ header->fsid, fs_info->fs_devices->fsid);
+ return;
+ }
+ if (memcmp(header->chunk_tree_uuid, fs_info->chunk_tree_uuid,
+ BTRFS_UUID_SIZE) != 0) {
+ bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
+ bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
+ btrfs_warn_rl(fs_info,
+ "tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU",
+ logical, stripe->mirror_num,
+ header->chunk_tree_uuid, fs_info->chunk_tree_uuid);
+ return;
+ }
+
+ /* Now check tree block csum. */
+ shash->tfm = fs_info->csum_shash;
+ crypto_shash_init(shash);
+ crypto_shash_update(shash, page_address(first_page) + first_off +
+ BTRFS_CSUM_SIZE, fs_info->sectorsize - BTRFS_CSUM_SIZE);
+
+ for (int i = sector_nr + 1; i < sector_nr + sectors_per_tree; i++) {
+ struct page *page = scrub_stripe_get_page(stripe, i);
+ unsigned int page_off = scrub_stripe_get_page_offset(stripe, i);
+
+ crypto_shash_update(shash, page_address(page) + page_off,
+ fs_info->sectorsize);
+ }
+
+ crypto_shash_final(shash, calculated_csum);
+ if (memcmp(calculated_csum, on_disk_csum, fs_info->csum_size) != 0) {
+ bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
+ bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
+ btrfs_warn_rl(fs_info,
+ "tree block %llu mirror %u has bad csum, has " CSUM_FMT " want " CSUM_FMT,
+ logical, stripe->mirror_num,
+ CSUM_FMT_VALUE(fs_info->csum_size, on_disk_csum),
+ CSUM_FMT_VALUE(fs_info->csum_size, calculated_csum));
+ return;
+ }
+ if (stripe->sectors[sector_nr].generation !=
+ btrfs_stack_header_generation(header)) {
+ bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
+ bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
+ btrfs_warn_rl(fs_info,
+ "tree block %llu mirror %u has bad generation, has %llu want %llu",
+ logical, stripe->mirror_num,
+ btrfs_stack_header_generation(header),
+ stripe->sectors[sector_nr].generation);
+ return;
+ }
+ bitmap_clear(&stripe->error_bitmap, sector_nr, sectors_per_tree);
+ bitmap_clear(&stripe->csum_error_bitmap, sector_nr, sectors_per_tree);
+ bitmap_clear(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
+}
+
static int scrub_checksum_tree_block(struct scrub_block *sblock)
{
struct scrub_ctx *sctx = sblock->sctx;
btrfs_bio_counter_inc_blocked(fs_info);
ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
&length, &bioc);
- if (ret || !bioc || !bioc->raid_map)
+ if (ret || !bioc)
goto bioc_out;
if (WARN_ON(!sctx->is_dev_replace ||
return 1;
}
+static bool should_use_device(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *dev,
+ bool follow_replace_read_mode)
+{
+ struct btrfs_device *replace_srcdev = fs_info->dev_replace.srcdev;
+ struct btrfs_device *replace_tgtdev = fs_info->dev_replace.tgtdev;
+
+ if (!dev->bdev)
+ return false;
+
+ /*
+ * We're doing scrub/replace, if it's pure scrub, no tgtdev should be
+ * here. If it's replace, we're going to write data to tgtdev, thus
+ * the current data of the tgtdev is all garbage, thus we can not use
+ * it at all.
+ */
+ if (dev == replace_tgtdev)
+ return false;
+
+ /* No need to follow replace read mode, any existing device is fine. */
+ if (!follow_replace_read_mode)
+ return true;
+
+ /* Need to follow the mode. */
+ if (fs_info->dev_replace.cont_reading_from_srcdev_mode ==
+ BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
+ return dev != replace_srcdev;
+ return true;
+}
+static int scrub_find_good_copy(struct btrfs_fs_info *fs_info,
+ u64 extent_logical, u32 extent_len,
+ u64 *extent_physical,
+ struct btrfs_device **extent_dev,
+ int *extent_mirror_num)
+{
+ u64 mapped_length;
+ struct btrfs_io_context *bioc = NULL;
+ int ret;
+ int i;
+
+ mapped_length = extent_len;
+ ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
+ extent_logical, &mapped_length, &bioc, 0);
+ if (ret || !bioc || mapped_length < extent_len) {
+ btrfs_put_bioc(bioc);
+ btrfs_err_rl(fs_info, "btrfs_map_block() failed for logical %llu: %d",
+ extent_logical, ret);
+ return -EIO;
+ }
+
+ /*
+ * First loop to exclude all missing devices and the source device if
+ * needed. And we don't want to use target device as mirror either, as
+ * we're doing the replace, the target device range contains nothing.
+ */
+ for (i = 0; i < bioc->num_stripes - bioc->replace_nr_stripes; i++) {
+ struct btrfs_io_stripe *stripe = &bioc->stripes[i];
+
+ if (!should_use_device(fs_info, stripe->dev, true))
+ continue;
+ goto found;
+ }
+ /*
+ * We didn't find any alternative mirrors, we have to break our replace
+ * read mode, or we can not read at all.
+ */
+ for (i = 0; i < bioc->num_stripes - bioc->replace_nr_stripes; i++) {
+ struct btrfs_io_stripe *stripe = &bioc->stripes[i];
+
+ if (!should_use_device(fs_info, stripe->dev, false))
+ continue;
+ goto found;
+ }
+
+ btrfs_err_rl(fs_info, "failed to find any live mirror for logical %llu",
+ extent_logical);
+ return -EIO;
+
+found:
+ *extent_physical = bioc->stripes[i].physical;
+ *extent_mirror_num = i + 1;
+ *extent_dev = bioc->stripes[i].dev;
+ btrfs_put_bioc(bioc);
+ return 0;
+}
+
+static bool scrub_need_different_mirror(struct scrub_ctx *sctx,
+ struct map_lookup *map,
+ struct btrfs_device *dev)
+{
+ /*
+ * For RAID56, all the extra mirrors are rebuilt from other P/Q,
+ * cannot utilize other mirrors directly.
+ */
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
+ return false;
+
+ if (!dev->bdev)
+ return true;
+
+ return sctx->fs_info->dev_replace.cont_reading_from_srcdev_mode ==
+ BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID;
+}
+
/* scrub extent tries to collect up to 64 kB for each bio */
static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
u64 logical, u32 len,
if (flags & BTRFS_EXTENT_FLAG_DATA) {
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
- blocksize = map->stripe_len;
+ blocksize = BTRFS_STRIPE_LEN;
else
blocksize = sctx->fs_info->sectorsize;
spin_lock(&sctx->stat_lock);
spin_unlock(&sctx->stat_lock);
} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
- blocksize = map->stripe_len;
+ blocksize = BTRFS_STRIPE_LEN;
else
blocksize = sctx->fs_info->nodesize;
spin_lock(&sctx->stat_lock);
}
/*
- * For dev-replace case, we can have @dev being a missing device.
- * Regular scrub will avoid its execution on missing device at all,
- * as that would trigger tons of read error.
- *
- * Reading from missing device will cause read error counts to
- * increase unnecessarily.
- * So here we change the read source to a good mirror.
+ * For dev-replace case, we can have @dev being a missing device, or
+ * we want to avoid reading from the source device if possible.
*/
- if (sctx->is_dev_replace && !dev->bdev)
- scrub_find_good_copy(sctx->fs_info, logical, len, &src_physical,
- &src_dev, &src_mirror);
+ if (sctx->is_dev_replace && scrub_need_different_mirror(sctx, map, dev)) {
+ ret = scrub_find_good_copy(sctx->fs_info, logical, len,
+ &src_physical, &src_dev, &src_mirror);
+ if (ret < 0)
+ return ret;
+ }
while (len) {
u32 l = min(len, blocksize);
int have_csum = 0;
{
int i;
int j = 0;
- u64 stripe_nr;
u64 last_offset;
- u32 stripe_index;
- u32 rot;
const int data_stripes = nr_data_stripes(map);
last_offset = (physical - map->stripes[num].physical) * data_stripes;
*offset = last_offset;
for (i = 0; i < data_stripes; i++) {
- *offset = last_offset + i * map->stripe_len;
+ u32 stripe_nr;
+ u32 stripe_index;
+ u32 rot;
+
+ *offset = last_offset + (i << BTRFS_STRIPE_LEN_SHIFT);
- stripe_nr = div64_u64(*offset, map->stripe_len);
- stripe_nr = div_u64(stripe_nr, data_stripes);
+ stripe_nr = (u32)(*offset >> BTRFS_STRIPE_LEN_SHIFT) / data_stripes;
/* Work out the disk rotation on this stripe-set */
- stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
+ rot = stripe_nr % map->num_stripes;
+ stripe_nr /= map->num_stripes;
/* calculate which stripe this data locates */
rot += i;
stripe_index = rot % map->num_stripes;
if (stripe_index < num)
j++;
}
- *offset = last_offset + j * map->stripe_len;
+ *offset = last_offset + (j << BTRFS_STRIPE_LEN_SHIFT);
return 1;
}
btrfs_bio_counter_inc_blocked(fs_info);
ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
&length, &bioc);
- if (ret || !bioc || !bioc->raid_map)
+ if (ret || !bioc)
goto bioc_out;
bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
/* Path must not be populated */
ASSERT(!path->nodes[0]);
- while (cur_logical < logical + map->stripe_len) {
+ while (cur_logical < logical + BTRFS_STRIPE_LEN) {
struct btrfs_io_context *bioc = NULL;
struct btrfs_device *extent_dev;
u64 extent_start;
u64 extent_mirror_num;
ret = find_first_extent_item(extent_root, path, cur_logical,
- logical + map->stripe_len - cur_logical);
+ logical + BTRFS_STRIPE_LEN - cur_logical);
/* No more extent item in this data stripe */
if (ret > 0) {
ret = 0;
/* Metadata should not cross stripe boundaries */
if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
does_range_cross_boundary(extent_start, extent_size,
- logical, map->stripe_len)) {
+ logical, BTRFS_STRIPE_LEN)) {
btrfs_err(fs_info,
"scrub: tree block %llu spanning stripes, ignored. logical=%llu",
extent_start, logical);
/* Truncate the range inside this data stripe */
extent_size = min(extent_start + extent_size,
- logical + map->stripe_len) - cur_logical;
+ logical + BTRFS_STRIPE_LEN) - cur_logical;
extent_start = cur_logical;
ASSERT(extent_size <= U32_MAX);
path->search_commit_root = 1;
path->skip_locking = 1;
- ASSERT(map->stripe_len <= U32_MAX);
- nsectors = map->stripe_len >> fs_info->sectorsize_bits;
+ nsectors = BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits;
ASSERT(nsectors <= BITS_PER_LONG);
sparity = kzalloc(sizeof(struct scrub_parity), GFP_NOFS);
if (!sparity) {
return -ENOMEM;
}
- ASSERT(map->stripe_len <= U32_MAX);
- sparity->stripe_len = map->stripe_len;
+ sparity->stripe_len = BTRFS_STRIPE_LEN;
sparity->nsectors = nsectors;
sparity->sctx = sctx;
sparity->scrub_dev = sdev;
ret = 0;
for (cur_logical = logic_start; cur_logical < logic_end;
- cur_logical += map->stripe_len) {
+ cur_logical += BTRFS_STRIPE_LEN) {
ret = scrub_raid56_data_stripe_for_parity(sctx, sparity, map,
sdev, path, cur_logical);
if (ret < 0)
return ret;
}
+static void fill_one_extent_info(struct btrfs_fs_info *fs_info,
+ struct scrub_stripe *stripe,
+ u64 extent_start, u64 extent_len,
+ u64 extent_flags, u64 extent_gen)
+{
+ for (u64 cur_logical = max(stripe->logical, extent_start);
+ cur_logical < min(stripe->logical + BTRFS_STRIPE_LEN,
+ extent_start + extent_len);
+ cur_logical += fs_info->sectorsize) {
+ const int nr_sector = (cur_logical - stripe->logical) >>
+ fs_info->sectorsize_bits;
+ struct scrub_sector_verification *sector =
+ &stripe->sectors[nr_sector];
+
+ set_bit(nr_sector, &stripe->extent_sector_bitmap);
+ if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+ sector->is_metadata = true;
+ sector->generation = extent_gen;
+ }
+ }
+}
+
+static void scrub_stripe_reset_bitmaps(struct scrub_stripe *stripe)
+{
+ stripe->extent_sector_bitmap = 0;
+ stripe->init_error_bitmap = 0;
+ stripe->error_bitmap = 0;
+ stripe->io_error_bitmap = 0;
+ stripe->csum_error_bitmap = 0;
+ stripe->meta_error_bitmap = 0;
+}
+
+/*
+ * Locate one stripe which has at least one extent in its range.
+ *
+ * Return 0 if found such stripe, and store its info into @stripe.
+ * Return >0 if there is no such stripe in the specified range.
+ * Return <0 for error.
+ */
+int scrub_find_fill_first_stripe(struct btrfs_block_group *bg,
+ struct btrfs_device *dev, u64 physical,
+ int mirror_num, u64 logical_start,
+ u32 logical_len, struct scrub_stripe *stripe)
+{
+ struct btrfs_fs_info *fs_info = bg->fs_info;
+ struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bg->start);
+ struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bg->start);
+ const u64 logical_end = logical_start + logical_len;
+ struct btrfs_path path = { 0 };
+ u64 cur_logical = logical_start;
+ u64 stripe_end;
+ u64 extent_start;
+ u64 extent_len;
+ u64 extent_flags;
+ u64 extent_gen;
+ int ret;
+
+ memset(stripe->sectors, 0, sizeof(struct scrub_sector_verification) *
+ stripe->nr_sectors);
+ scrub_stripe_reset_bitmaps(stripe);
+
+ /* The range must be inside the bg. */
+ ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length);
+
+ path.search_commit_root = 1;
+ path.skip_locking = 1;
+
+ ret = find_first_extent_item(extent_root, &path, logical_start, logical_len);
+ /* Either error or not found. */
+ if (ret)
+ goto out;
+ get_extent_info(&path, &extent_start, &extent_len, &extent_flags, &extent_gen);
+ cur_logical = max(extent_start, cur_logical);
+
+ /*
+ * Round down to stripe boundary.
+ *
+ * The extra calculation against bg->start is to handle block groups
+ * whose logical bytenr is not BTRFS_STRIPE_LEN aligned.
+ */
+ stripe->logical = round_down(cur_logical - bg->start, BTRFS_STRIPE_LEN) +
+ bg->start;
+ stripe->physical = physical + stripe->logical - logical_start;
+ stripe->dev = dev;
+ stripe->bg = bg;
+ stripe->mirror_num = mirror_num;
+ stripe_end = stripe->logical + BTRFS_STRIPE_LEN - 1;
+
+ /* Fill the first extent info into stripe->sectors[] array. */
+ fill_one_extent_info(fs_info, stripe, extent_start, extent_len,
+ extent_flags, extent_gen);
+ cur_logical = extent_start + extent_len;
+
+ /* Fill the extent info for the remaining sectors. */
+ while (cur_logical <= stripe_end) {
+ ret = find_first_extent_item(extent_root, &path, cur_logical,
+ stripe_end - cur_logical + 1);
+ if (ret < 0)
+ goto out;
+ if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ get_extent_info(&path, &extent_start, &extent_len,
+ &extent_flags, &extent_gen);
+ fill_one_extent_info(fs_info, stripe, extent_start, extent_len,
+ extent_flags, extent_gen);
+ cur_logical = extent_start + extent_len;
+ }
+
+ /* Now fill the data csum. */
+ if (bg->flags & BTRFS_BLOCK_GROUP_DATA) {
+ int sector_nr;
+ unsigned long csum_bitmap = 0;
+
+ /* Csum space should have already been allocated. */
+ ASSERT(stripe->csums);
+
+ /*
+ * Our csum bitmap should be large enough, as BTRFS_STRIPE_LEN
+ * should contain at most 16 sectors.
+ */
+ ASSERT(BITS_PER_LONG >= BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits);
+
+ ret = btrfs_lookup_csums_bitmap(csum_root, stripe->logical,
+ stripe_end, stripe->csums,
+ &csum_bitmap, true);
+ if (ret < 0)
+ goto out;
+ if (ret > 0)
+ ret = 0;
+
+ for_each_set_bit(sector_nr, &csum_bitmap, stripe->nr_sectors) {
+ stripe->sectors[sector_nr].csum = stripe->csums +
+ sector_nr * fs_info->csum_size;
+ }
+ }
+ set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state);
+out:
+ btrfs_release_path(&path);
+ return ret;
+}
+
/*
* Scrub one range which can only has simple mirror based profile.
* (Including all range in SINGLE/DUP/RAID1/RAID1C*, and each stripe in
* and @logical_length parameter.
*/
static int scrub_simple_mirror(struct scrub_ctx *sctx,
- struct btrfs_root *extent_root,
- struct btrfs_root *csum_root,
struct btrfs_block_group *bg,
struct map_lookup *map,
u64 logical_start, u64 logical_length,
u64 physical, int mirror_num)
{
struct btrfs_fs_info *fs_info = sctx->fs_info;
+ struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bg->start);
+ struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bg->start);
const u64 logical_end = logical_start + logical_length;
/* An artificial limit, inherit from old scrub behavior */
const u32 max_length = SZ_64K;
ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
BTRFS_BLOCK_GROUP_RAID10));
- return map->num_stripes / map->sub_stripes * map->stripe_len;
+ return (map->num_stripes / map->sub_stripes) << BTRFS_STRIPE_LEN_SHIFT;
}
/* Get the logical bytenr for the stripe */
* (stripe_index / sub_stripes) gives how many data stripes we need to
* skip.
*/
- return (stripe_index / map->sub_stripes) * map->stripe_len + bg->start;
+ return ((stripe_index / map->sub_stripes) << BTRFS_STRIPE_LEN_SHIFT) +
+ bg->start;
}
/* Get the mirror number for the stripe */
}
static int scrub_simple_stripe(struct scrub_ctx *sctx,
- struct btrfs_root *extent_root,
- struct btrfs_root *csum_root,
struct btrfs_block_group *bg,
struct map_lookup *map,
struct btrfs_device *device,
* just RAID1, so we can reuse scrub_simple_mirror() to scrub
* this stripe.
*/
- ret = scrub_simple_mirror(sctx, extent_root, csum_root, bg, map,
- cur_logical, map->stripe_len, device,
- cur_physical, mirror_num);
+ ret = scrub_simple_mirror(sctx, bg, map, cur_logical,
+ BTRFS_STRIPE_LEN, device, cur_physical,
+ mirror_num);
if (ret)
return ret;
/* Skip to next stripe which belongs to the target device */
cur_logical += logical_increment;
/* For physical offset, we just go to next stripe */
- cur_physical += map->stripe_len;
+ cur_physical += BTRFS_STRIPE_LEN;
}
return ret;
}
struct btrfs_device *scrub_dev,
int stripe_index)
{
- struct btrfs_path *path;
struct btrfs_fs_info *fs_info = sctx->fs_info;
- struct btrfs_root *root;
- struct btrfs_root *csum_root;
struct blk_plug plug;
struct map_lookup *map = em->map_lookup;
const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
u64 stripe_end;
int stop_loop = 0;
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- /*
- * work on commit root. The related disk blocks are static as
- * long as COW is applied. This means, it is save to rewrite
- * them to repair disk errors without any race conditions
- */
- path->search_commit_root = 1;
- path->skip_locking = 1;
- path->reada = READA_FORWARD;
-
wait_event(sctx->list_wait,
atomic_read(&sctx->bios_in_flight) == 0);
scrub_blocked_if_needed(fs_info);
- root = btrfs_extent_root(fs_info, bg->start);
- csum_root = btrfs_csum_root(fs_info, bg->start);
-
/*
* collect all data csums for the stripe to avoid seeking during
* the scrub. This might currently (crc32) end up to be about 1MB
* Only @physical and @mirror_num needs to calculated using
* @stripe_index.
*/
- ret = scrub_simple_mirror(sctx, root, csum_root, bg, map,
- bg->start, bg->length, scrub_dev,
- map->stripes[stripe_index].physical,
+ ret = scrub_simple_mirror(sctx, bg, map, bg->start, bg->length,
+ scrub_dev, map->stripes[stripe_index].physical,
stripe_index + 1);
offset = 0;
goto out;
}
if (profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
- ret = scrub_simple_stripe(sctx, root, csum_root, bg, map,
- scrub_dev, stripe_index);
- offset = map->stripe_len * (stripe_index / map->sub_stripes);
+ ret = scrub_simple_stripe(sctx, bg, map, scrub_dev, stripe_index);
+ offset = (stripe_index / map->sub_stripes) << BTRFS_STRIPE_LEN_SHIFT;
goto out;
}
/* Initialize @offset in case we need to go to out: label */
get_raid56_logic_offset(physical, stripe_index, map, &offset, NULL);
- increment = map->stripe_len * nr_data_stripes(map);
+ increment = nr_data_stripes(map) << BTRFS_STRIPE_LEN_SHIFT;
/*
* Due to the rotation, for RAID56 it's better to iterate each stripe
* We can reuse scrub_simple_mirror() here, as the repair part
* is still based on @mirror_num.
*/
- ret = scrub_simple_mirror(sctx, root, csum_root, bg, map,
- logical, map->stripe_len,
+ ret = scrub_simple_mirror(sctx, bg, map, logical, BTRFS_STRIPE_LEN,
scrub_dev, physical, 1);
if (ret < 0)
goto out;
next:
logical += increment;
- physical += map->stripe_len;
+ physical += BTRFS_STRIPE_LEN;
spin_lock(&sctx->stat_lock);
if (stop_loop)
sctx->stat.last_physical =
mutex_unlock(&sctx->wr_lock);
blk_finish_plug(&plug);
- btrfs_free_path(path);
if (sctx->is_dev_replace && ret >= 0) {
int ret2;
return ret;
}
+static int scrub_one_super(struct scrub_ctx *sctx, struct btrfs_device *dev,
+ struct page *page, u64 physical, u64 generation)
+{
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
+ struct bio_vec bvec;
+ struct bio bio;
+ struct btrfs_super_block *sb = page_address(page);
+ int ret;
+
+ bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_READ);
+ bio.bi_iter.bi_sector = physical >> SECTOR_SHIFT;
+ __bio_add_page(&bio, page, BTRFS_SUPER_INFO_SIZE, 0);
+ ret = submit_bio_wait(&bio);
+ bio_uninit(&bio);
+
+ if (ret < 0)
+ return ret;
+ ret = btrfs_check_super_csum(fs_info, sb);
+ if (ret != 0) {
+ btrfs_err_rl(fs_info,
+ "super block at physical %llu devid %llu has bad csum",
+ physical, dev->devid);
+ return -EIO;
+ }
+ if (btrfs_super_generation(sb) != generation) {
+ btrfs_err_rl(fs_info,
+"super block at physical %llu devid %llu has bad generation %llu expect %llu",
+ physical, dev->devid,
+ btrfs_super_generation(sb), generation);
+ return -EUCLEAN;
+ }
+
+ return btrfs_validate_super(fs_info, sb, -1);
+}
+
static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
struct btrfs_device *scrub_dev)
{
int i;
u64 bytenr;
u64 gen;
- int ret;
+ int ret = 0;
+ struct page *page;
struct btrfs_fs_info *fs_info = sctx->fs_info;
if (BTRFS_FS_ERROR(fs_info))
return -EROFS;
+ page = alloc_page(GFP_KERNEL);
+ if (!page) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.malloc_errors++;
+ spin_unlock(&sctx->stat_lock);
+ return -ENOMEM;
+ }
+
/* Seed devices of a new filesystem has their own generation. */
if (scrub_dev->fs_devices != fs_info->fs_devices)
gen = scrub_dev->generation;
if (!btrfs_check_super_location(scrub_dev, bytenr))
continue;
- ret = scrub_sectors(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
- scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
- NULL, bytenr);
- if (ret)
- return ret;
+ ret = scrub_one_super(sctx, scrub_dev, page, bytenr, gen);
+ if (ret) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.super_errors++;
+ spin_unlock(&sctx->stat_lock);
+ }
}
- wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
-
+ __free_page(page);
return 0;
}
return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
}
-
-static void scrub_find_good_copy(struct btrfs_fs_info *fs_info,
- u64 extent_logical, u32 extent_len,
- u64 *extent_physical,
- struct btrfs_device **extent_dev,
- int *extent_mirror_num)
-{
- u64 mapped_length;
- struct btrfs_io_context *bioc = NULL;
- int ret;
-
- mapped_length = extent_len;
- ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical,
- &mapped_length, &bioc, 0);
- if (ret || !bioc || mapped_length < extent_len ||
- !bioc->stripes[0].dev->bdev) {
- btrfs_put_bioc(bioc);
- return;
- }
-
- *extent_physical = bioc->stripes[0].physical;
- *extent_mirror_num = bioc->mirror_num;
- *extent_dev = bioc->stripes[0].dev;
- btrfs_put_bioc(bioc);
-}