btrfs: scrub: introduce a helper to verify one metadata block

[tomoyo/tomoyo-test1.git] / fs / btrfs / scrub.c
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c

index 69c93ae..034d2ad 100644 (file)
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -70,6 +70,94 @@ struct scrub_ctx;
   */
  #define BTRFS_MAX_MIRRORS (4 + 1)
  
+/*
+ * Represent one sector and its needed info to verify the content.
+ *
+ * @is_metadata is set by fill_one_extent_info() for sectors covered by a tree
+ * block extent, together with @generation.  Since @csum and @generation share
+ * storage, only one of them is meaningful for any given sector.
+ */
+struct scrub_sector_verification {
+       bool is_metadata;
+
+       union {
+               /*
+                * Csum pointer for data csum verification.  Should point to a
+                * sector csum inside scrub_stripe::csums, i.e. this pointer
+                * does not own the memory and must not be freed on its own.
+                *
+                * NULL if this data sector has no csum.
+                */
+               u8 *csum;
+
+               /*
+                * Extra info for metadata verification.  All sectors inside a
+                * tree block share the same generation, which is the one
+                * passed to fill_one_extent_info() for the covering extent.
+                */
+               u64 generation;
+       };
+};
+
+enum scrub_stripe_flags {
+       /*
+        * Set when @mirror_num, @dev, @physical and @logical are set.
+        *
+        * Like every value of this enum, this is a bit number inside
+        * scrub_stripe::state and is meant to be used with the bit helpers,
+        * e.g. the set_bit() at the end of scrub_find_fill_first_stripe().
+        */
+       SCRUB_STRIPE_FLAG_INITIALIZED,
+
+       /* Set when the read-repair is finished. */
+       SCRUB_STRIPE_FLAG_REPAIR_DONE,
+};
+
+#define SCRUB_STRIPE_PAGES             (BTRFS_STRIPE_LEN / PAGE_SIZE)
+
+/*
+ * Represent one contiguous range with a length of BTRFS_STRIPE_LEN.
+ *
+ * @pages, @sectors and @csums are allocated by init_scrub_stripe() and freed
+ * by release_scrub_stripe().  @pending_io and @io_wait are what
+ * wait_scrub_stripe_io() sleeps on until @pending_io reaches zero.
+ */
+struct scrub_stripe {
+       struct btrfs_block_group *bg;
+
+       struct page *pages[SCRUB_STRIPE_PAGES];
+       struct scrub_sector_verification *sectors;
+
+       struct btrfs_device *dev;
+       u64 logical;
+       u64 physical;
+
+       u16 mirror_num;
+
+       /*
+        * Should be BTRFS_STRIPE_LEN / sectorsize.  This is also the number
+        * of entries in @sectors.
+        */
+       u16 nr_sectors;
+
+       atomic_t pending_io;
+       wait_queue_head_t io_wait;
+
+       /*
+        * Indicate the states of the stripe.  Bits are defined in
+        * scrub_stripe_flags enum.
+        */
+       unsigned long state;
+
+       /*
+        * Indicate which sectors are covered by extent items.
+        *
+        * For this and all the following bitmaps, bit N stands for sector N
+        * of the stripe.
+        */
+       unsigned long extent_sector_bitmap;
+
+       /*
+        * The errors hit during the initial read of the stripe.
+        *
+        * Would be utilized for error reporting and repair.
+        */
+       unsigned long init_error_bitmap;
+
+       /*
+        * The following error bitmaps are all for the current status.
+        * Every time we submit a new read, these bitmaps may be updated.
+        *
+        * error_bitmap = io_error_bitmap | csum_error_bitmap | meta_error_bitmap;
+        *
+        * IO and csum errors can happen for both metadata and data.
+        */
+       unsigned long error_bitmap;
+       unsigned long io_error_bitmap;
+       unsigned long csum_error_bitmap;
+       unsigned long meta_error_bitmap;
+
+       /*
+        * Checksum for the whole stripe if this stripe is inside a data block
+        * group.  Holds one csum of fs_info->csum_size bytes per sector.
+        */
+       u8 *csums;
+};
+
  struct scrub_recover {
         refcount_t              refs;
         struct btrfs_io_context *bioc;
@@ -266,6 +354,60 @@ static void detach_scrub_page_private(struct page *page)
  #endif
  }
  
+/*
+ * Free everything init_scrub_stripe() allocated for @stripe (pages, sector
+ * array and csum buffer), reset the matching pointers to NULL and clear the
+ * state bits.  The stripe structure itself is not freed.
+ *
+ * Calling this with a NULL @stripe is a no-op.
+ */
+static void release_scrub_stripe(struct scrub_stripe *stripe)
+{
+       int nr;
+
+       if (!stripe)
+               return;
+
+       for (nr = 0; nr < SCRUB_STRIPE_PAGES; nr++) {
+               struct page *page = stripe->pages[nr];
+
+               stripe->pages[nr] = NULL;
+               if (page)
+                       __free_page(page);
+       }
+
+       kfree(stripe->csums);
+       stripe->csums = NULL;
+
+       kfree(stripe->sectors);
+       stripe->sectors = NULL;
+
+       stripe->state = 0;
+}
+
+/*
+ * Prepare @stripe for use: zero the whole structure, set up the IO wait
+ * queue and counter, then allocate the pages, the per-sector verification
+ * array and the csum buffer.
+ *
+ * Return 0 on success.
+ * Return -ENOMEM if any allocation fails, in which case everything allocated
+ * so far has been released again.
+ */
+int init_scrub_stripe(struct btrfs_fs_info *fs_info, struct scrub_stripe *stripe)
+{
+       /* The memset() also leaves stripe->state and all bitmaps at zero. */
+       memset(stripe, 0, sizeof(*stripe));
+
+       atomic_set(&stripe->pending_io, 0);
+       init_waitqueue_head(&stripe->io_wait);
+       stripe->nr_sectors = BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits;
+
+       if (btrfs_alloc_page_array(SCRUB_STRIPE_PAGES, stripe->pages) < 0)
+               goto enomem;
+
+       stripe->sectors = kcalloc(stripe->nr_sectors, sizeof(*stripe->sectors),
+                                 GFP_KERNEL);
+       if (!stripe->sectors)
+               goto enomem;
+
+       /* One csum slot for each sector of the stripe. */
+       stripe->csums = kcalloc(BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits,
+                               fs_info->csum_size, GFP_KERNEL);
+       if (!stripe->csums)
+               goto enomem;
+
+       return 0;
+
+enomem:
+       release_scrub_stripe(stripe);
+       return -ENOMEM;
+}
+
+/* Sleep until no IO is pending on @stripe, i.e. @pending_io is zero. */
+void wait_scrub_stripe_io(struct scrub_stripe *stripe)
+{
+       wait_event(stripe->io_wait, !atomic_read(&stripe->pending_io));
+}
+
  static struct scrub_block *alloc_scrub_block(struct scrub_ctx *sctx,
                                              struct btrfs_device *dev,
                                              u64 logical, u64 physical,
@@ -423,11 +565,6 @@ static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
  static void scrub_bio_end_io(struct bio *bio);
  static void scrub_bio_end_io_worker(struct work_struct *work);
  static void scrub_block_complete(struct scrub_block *sblock);
-static void scrub_find_good_copy(struct btrfs_fs_info *fs_info,
-                                u64 extent_logical, u32 extent_len,
-                                u64 *extent_physical,
-                                struct btrfs_device **extent_dev,
-                                int *extent_mirror_num);
  static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx,
                                       struct scrub_sector *sector);
  static void scrub_wr_submit(struct scrub_ctx *sctx);
@@ -1230,7 +1367,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
                         sblock_other = sblocks_for_recheck[mirror_index];
                 } else {
                         struct scrub_recover *r = sblock_bad->sectors[0]->recover;
-                       int max_allowed = r->bioc->num_stripes - r->bioc->num_tgtdevs;
+                       int max_allowed = r->bioc->num_stripes - r->bioc->replace_nr_stripes;
  
                         if (mirror_index >= max_allowed)
                                 break;
@@ -1430,7 +1567,7 @@ static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc)
  }
  
  static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
-                                                u64 *raid_map,
+                                                u64 full_stripe_logical,
                                                  int nstripes, int mirror,
                                                  int *stripe_index,
                                                  u64 *stripe_offset)
@@ -1438,19 +1575,22 @@ static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
         int i;
  
         if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+               const int nr_data_stripes = (map_type & BTRFS_BLOCK_GROUP_RAID5) ?
+                                           nstripes - 1 : nstripes - 2;
+
                 /* RAID5/6 */
-               for (i = 0; i < nstripes; i++) {
-                       if (raid_map[i] == RAID6_Q_STRIPE ||
-                           raid_map[i] == RAID5_P_STRIPE)
-                               continue;
+               for (i = 0; i < nr_data_stripes; i++) {
+                       const u64 data_stripe_start = full_stripe_logical +
+                                               (i * BTRFS_STRIPE_LEN);
  
-                       if (logical >= raid_map[i] &&
-                           logical < raid_map[i] + BTRFS_STRIPE_LEN)
+                       if (logical >= data_stripe_start &&
+                           logical < data_stripe_start + BTRFS_STRIPE_LEN)
                                 break;
                 }
  
                 *stripe_index = i;
-               *stripe_offset = logical - raid_map[i];
+               *stripe_offset = (logical - full_stripe_logical) &
+                                BTRFS_STRIPE_LEN_MASK;
         } else {
                 /* The other RAID type */
                 *stripe_index = mirror;
@@ -1538,9 +1678,9 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
  
                         scrub_stripe_index_and_offset(logical,
                                                       bioc->map_type,
-                                                     bioc->raid_map,
+                                                     bioc->full_stripe_logical,
                                                       bioc->num_stripes -
-                                                     bioc->num_tgtdevs,
+                                                     bioc->replace_nr_stripes,
                                                       mirror_index,
                                                       &stripe_index,
                                                       &stripe_offset);
@@ -2019,6 +2159,112 @@ static int scrub_checksum_data(struct scrub_block *sblock)
         return sblock->checksum_error;
  }
  
+/* Return the page of @stripe which holds sector @sector_nr. */
+static struct page *scrub_stripe_get_page(struct scrub_stripe *stripe, int sector_nr)
+{
+       const u32 sectorsize_bits = stripe->bg->fs_info->sectorsize_bits;
+       /* Byte offset of the sector from the start of the stripe. */
+       const int stripe_offset = sector_nr << sectorsize_bits;
+
+       return stripe->pages[stripe_offset >> PAGE_SHIFT];
+}
+
+/*
+ * Return the offset of sector @sector_nr inside the page returned by
+ * scrub_stripe_get_page() for the same sector.
+ */
+static unsigned int scrub_stripe_get_page_offset(struct scrub_stripe *stripe,
+                                                int sector_nr)
+{
+       const u32 sectorsize_bits = stripe->bg->fs_info->sectorsize_bits;
+       /* Byte offset of the sector from the start of the stripe. */
+       const int stripe_offset = sector_nr << sectorsize_bits;
+
+       return offset_in_page(stripe_offset);
+}
+
+/*
+ * Verify the tree block which starts at sector @sector_nr of @stripe.
+ *
+ * The checks are done in this order and stop at the first failure:
+ * header bytenr, fsid, chunk tree uuid, checksum of the whole tree block and
+ * finally the header generation against stripe->sectors[@sector_nr].
+ *
+ * On failure all sectors of the tree block are set in @error_bitmap and in
+ * either @csum_error_bitmap (checksum mismatch) or @meta_error_bitmap (any
+ * other header mismatch), and a rate limited warning is printed.
+ * On success the sectors are cleared from all three bitmaps.
+ */
+void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr)
+{
+       struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
+       const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits;
+       const u64 logical = stripe->logical + (sector_nr << fs_info->sectorsize_bits);
+       const struct page *first_page = scrub_stripe_get_page(stripe, sector_nr);
+       const unsigned int first_off = scrub_stripe_get_page_offset(stripe, sector_nr);
+       SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
+       u8 on_disk_csum[BTRFS_CSUM_SIZE];
+       u8 calculated_csum[BTRFS_CSUM_SIZE];
+       struct btrfs_header *header;
+
+       /*
+        * Here we don't have a good way to attach the pages (and subpages)
+        * to a dummy extent buffer, thus we have to directly grab the members
+        * from pages.
+        */
+       header = (struct btrfs_header *)(page_address(first_page) + first_off);
+       memcpy(on_disk_csum, header->csum, fs_info->csum_size);
+
+       /*
+        * A wrong bytenr is a header (metadata) mismatch, not a checksum
+        * failure, thus it is recorded in meta_error_bitmap like the fsid,
+        * chunk tree uuid and generation checks below.
+        */
+       if (logical != btrfs_stack_header_bytenr(header)) {
+               bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
+               bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
+               btrfs_warn_rl(fs_info,
+               "tree block %llu mirror %u has bad bytenr, has %llu want %llu",
+                             logical, stripe->mirror_num,
+                             btrfs_stack_header_bytenr(header), logical);
+               return;
+       }
+       if (memcmp(header->fsid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE) != 0) {
+               bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
+               bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
+               btrfs_warn_rl(fs_info,
+               "tree block %llu mirror %u has bad fsid, has %pU want %pU",
+                             logical, stripe->mirror_num,
+                             header->fsid, fs_info->fs_devices->fsid);
+               return;
+       }
+       if (memcmp(header->chunk_tree_uuid, fs_info->chunk_tree_uuid,
+                  BTRFS_UUID_SIZE) != 0) {
+               bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
+               bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
+               btrfs_warn_rl(fs_info,
+               "tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU",
+                             logical, stripe->mirror_num,
+                             header->chunk_tree_uuid, fs_info->chunk_tree_uuid);
+               return;
+       }
+
+       /*
+        * Now check tree block csum.
+        *
+        * The first BTRFS_CSUM_SIZE bytes of the block hold the csum itself
+        * and are skipped, the rest of the first sector and all the
+        * following sectors of the tree block are hashed.
+        */
+       shash->tfm = fs_info->csum_shash;
+       crypto_shash_init(shash);
+       crypto_shash_update(shash, page_address(first_page) + first_off +
+                           BTRFS_CSUM_SIZE, fs_info->sectorsize - BTRFS_CSUM_SIZE);
+
+       for (int i = sector_nr + 1; i < sector_nr + sectors_per_tree; i++) {
+               struct page *page = scrub_stripe_get_page(stripe, i);
+               unsigned int page_off = scrub_stripe_get_page_offset(stripe, i);
+
+               crypto_shash_update(shash, page_address(page) + page_off,
+                                   fs_info->sectorsize);
+       }
+
+       crypto_shash_final(shash, calculated_csum);
+       /* A checksum mismatch is the one failure recorded as a csum error. */
+       if (memcmp(calculated_csum, on_disk_csum, fs_info->csum_size) != 0) {
+               bitmap_set(&stripe->csum_error_bitmap, sector_nr, sectors_per_tree);
+               bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
+               btrfs_warn_rl(fs_info,
+               "tree block %llu mirror %u has bad csum, has " CSUM_FMT " want " CSUM_FMT,
+                             logical, stripe->mirror_num,
+                             CSUM_FMT_VALUE(fs_info->csum_size, on_disk_csum),
+                             CSUM_FMT_VALUE(fs_info->csum_size, calculated_csum));
+               return;
+       }
+       if (stripe->sectors[sector_nr].generation !=
+           btrfs_stack_header_generation(header)) {
+               bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
+               bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
+               btrfs_warn_rl(fs_info,
+               "tree block %llu mirror %u has bad generation, has %llu want %llu",
+                             logical, stripe->mirror_num,
+                             btrfs_stack_header_generation(header),
+                             stripe->sectors[sector_nr].generation);
+               return;
+       }
+
+       /* Everything matched, drop any error recorded by a previous read. */
+       bitmap_clear(&stripe->error_bitmap, sector_nr, sectors_per_tree);
+       bitmap_clear(&stripe->csum_error_bitmap, sector_nr, sectors_per_tree);
+       bitmap_clear(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
+}
+
  static int scrub_checksum_tree_block(struct scrub_block *sblock)
  {
         struct scrub_ctx *sctx = sblock->sctx;
@@ -2398,7 +2644,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
         btrfs_bio_counter_inc_blocked(fs_info);
         ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
                                &length, &bioc);
-       if (ret || !bioc || !bioc->raid_map)
+       if (ret || !bioc)
                 goto bioc_out;
  
         if (WARN_ON(!sctx->is_dev_replace ||
@@ -2707,6 +2953,110 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
         return 1;
  }
  
+/*
+ * Tell whether @dev may be used as a read source for scrub/replace.
+ *
+ * A device without bdev and the replace target device are never usable.
+ * If @follow_replace_read_mode is true and the replace read mode is "avoid",
+ * the replace source device is rejected as well.
+ */
+static bool should_use_device(struct btrfs_fs_info *fs_info,
+                             struct btrfs_device *dev,
+                             bool follow_replace_read_mode)
+{
+       struct btrfs_device *srcdev = fs_info->dev_replace.srcdev;
+       struct btrfs_device *tgtdev = fs_info->dev_replace.tgtdev;
+
+       /*
+        * No bdev means there is nothing to read from.
+        *
+        * For pure scrub no tgtdev should show up here.  For replace, the
+        * tgtdev is what we are going to write to, its current content is
+        * all garbage and can not be used at all.
+        */
+       if (!dev->bdev || dev == tgtdev)
+               return false;
+
+       /*
+        * The source device is only off limits when the caller wants the
+        * replace read mode to be followed and that mode is "avoid".
+        */
+       if (follow_replace_read_mode &&
+           fs_info->dev_replace.cont_reading_from_srcdev_mode ==
+           BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID &&
+           dev == srcdev)
+               return false;
+
+       return true;
+}
+/*
+ * Find a mirror of [@extent_logical, @extent_logical + @extent_len) which can
+ * be used as read source.
+ *
+ * Mirrors accepted by should_use_device() with the replace read mode followed
+ * are preferred.  If there is none, the read mode is ignored and any usable
+ * device is taken.  The trailing bioc->replace_nr_stripes stripes are never
+ * considered.
+ *
+ * Return 0 and fill @extent_physical, @extent_dev and @extent_mirror_num on
+ * success.
+ * Return -EIO if the range can not be mapped or no usable mirror exists, the
+ * output parameters are left untouched in that case.
+ */
+static int scrub_find_good_copy(struct btrfs_fs_info *fs_info,
+                               u64 extent_logical, u32 extent_len,
+                               u64 *extent_physical,
+                               struct btrfs_device **extent_dev,
+                               int *extent_mirror_num)
+{
+       u64 mapped_length;
+       struct btrfs_io_context *bioc = NULL;
+       int nr_mirrors;
+       int ret;
+       int i;
+
+       mapped_length = extent_len;
+       ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
+                             extent_logical, &mapped_length, &bioc, 0);
+       if (ret || !bioc || mapped_length < extent_len) {
+               btrfs_put_bioc(bioc);
+               btrfs_err_rl(fs_info, "btrfs_map_block() failed for logical %llu: %d",
+                               extent_logical, ret);
+               return -EIO;
+       }
+
+       nr_mirrors = bioc->num_stripes - bioc->replace_nr_stripes;
+
+       /*
+        * First loop to exclude all missing devices and the source device if
+        * needed.  And we don't want to use target device as mirror either, as
+        * we're doing the replace, the target device range contains nothing.
+        */
+       for (i = 0; i < nr_mirrors; i++) {
+               if (should_use_device(fs_info, bioc->stripes[i].dev, true))
+                       goto found;
+       }
+       /*
+        * We didn't find any alternative mirrors, we have to break our replace
+        * read mode, or we can not read at all.
+        */
+       for (i = 0; i < nr_mirrors; i++) {
+               if (should_use_device(fs_info, bioc->stripes[i].dev, false))
+                       goto found;
+       }
+
+       btrfs_err_rl(fs_info, "failed to find any live mirror for logical %llu",
+                       extent_logical);
+       ret = -EIO;
+       goto out;
+
+found:
+       *extent_physical = bioc->stripes[i].physical;
+       *extent_mirror_num = i + 1;
+       *extent_dev = bioc->stripes[i].dev;
+       ret = 0;
+out:
+       /*
+        * Drop the reference taken by btrfs_map_block() on every path, the
+        * "no live mirror" case included, or the bioc would be leaked.
+        */
+       btrfs_put_bioc(bioc);
+       return ret;
+}
+
+/*
+ * Tell whether the read source should be switched away from @dev.
+ *
+ * Never for RAID56 chunks.  Otherwise yes if @dev has no bdev, or if the
+ * replace read mode asks to avoid reading from the source device.
+ */
+static bool scrub_need_different_mirror(struct scrub_ctx *sctx,
+                                       struct map_lookup *map,
+                                       struct btrfs_device *dev)
+{
+       const bool avoid_srcdev =
+               sctx->fs_info->dev_replace.cont_reading_from_srcdev_mode ==
+               BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID;
+
+       /*
+        * For RAID56, all the extra mirrors are rebuilt from other P/Q,
+        * cannot utilize other mirrors directly.
+        */
+       if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
+               return false;
+
+       return !dev->bdev || avoid_srcdev;
+}
+
  /* scrub extent tries to collect up to 64 kB for each bio */
  static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
                         u64 logical, u32 len,
@@ -2722,7 +3072,7 @@ static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
  
         if (flags & BTRFS_EXTENT_FLAG_DATA) {
                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
-                       blocksize = map->stripe_len;
+                       blocksize = BTRFS_STRIPE_LEN;
                 else
                         blocksize = sctx->fs_info->sectorsize;
                 spin_lock(&sctx->stat_lock);
@@ -2731,7 +3081,7 @@ static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
                 spin_unlock(&sctx->stat_lock);
         } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
-                       blocksize = map->stripe_len;
+                       blocksize = BTRFS_STRIPE_LEN;
                 else
                         blocksize = sctx->fs_info->nodesize;
                 spin_lock(&sctx->stat_lock);
@@ -2744,17 +3094,15 @@ static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
         }
  
         /*
-        * For dev-replace case, we can have @dev being a missing device.
-        * Regular scrub will avoid its execution on missing device at all,
-        * as that would trigger tons of read error.
-        *
-        * Reading from missing device will cause read error counts to
-        * increase unnecessarily.
-        * So here we change the read source to a good mirror.
+        * For dev-replace case, we can have @dev being a missing device, or
+        * we want to avoid reading from the source device if possible.
          */
-       if (sctx->is_dev_replace && !dev->bdev)
-               scrub_find_good_copy(sctx->fs_info, logical, len, &src_physical,
-                                    &src_dev, &src_mirror);
+       if (sctx->is_dev_replace && scrub_need_different_mirror(sctx, map, dev)) {
+               ret = scrub_find_good_copy(sctx->fs_info, logical, len,
+                                          &src_physical, &src_dev, &src_mirror);
+               if (ret < 0)
+                       return ret;
+       }
         while (len) {
                 u32 l = min(len, blocksize);
                 int have_csum = 0;
@@ -2908,10 +3256,7 @@ static int get_raid56_logic_offset(u64 physical, int num,
  {
         int i;
         int j = 0;
-       u64 stripe_nr;
         u64 last_offset;
-       u32 stripe_index;
-       u32 rot;
         const int data_stripes = nr_data_stripes(map);
  
         last_offset = (physical - map->stripes[num].physical) * data_stripes;
@@ -2920,13 +3265,17 @@ static int get_raid56_logic_offset(u64 physical, int num,
  
         *offset = last_offset;
         for (i = 0; i < data_stripes; i++) {
-               *offset = last_offset + i * map->stripe_len;
+               u32 stripe_nr;
+               u32 stripe_index;
+               u32 rot;
+
+               *offset = last_offset + (i << BTRFS_STRIPE_LEN_SHIFT);
  
-               stripe_nr = div64_u64(*offset, map->stripe_len);
-               stripe_nr = div_u64(stripe_nr, data_stripes);
+               stripe_nr = (u32)(*offset >> BTRFS_STRIPE_LEN_SHIFT) / data_stripes;
  
                 /* Work out the disk rotation on this stripe-set */
-               stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
+               rot = stripe_nr % map->num_stripes;
+               stripe_nr /= map->num_stripes;
                 /* calculate which stripe this data locates */
                 rot += i;
                 stripe_index = rot % map->num_stripes;
@@ -2935,7 +3284,7 @@ static int get_raid56_logic_offset(u64 physical, int num,
                 if (stripe_index < num)
                         j++;
         }
-       *offset = last_offset + j * map->stripe_len;
+       *offset = last_offset + (j << BTRFS_STRIPE_LEN_SHIFT);
         return 1;
  }
  
@@ -3006,7 +3355,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
         btrfs_bio_counter_inc_blocked(fs_info);
         ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
                                &length, &bioc);
-       if (ret || !bioc || !bioc->raid_map)
+       if (ret || !bioc)
                 goto bioc_out;
  
         bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
@@ -3205,7 +3554,7 @@ static int scrub_raid56_data_stripe_for_parity(struct scrub_ctx *sctx,
         /* Path must not be populated */
         ASSERT(!path->nodes[0]);
  
-       while (cur_logical < logical + map->stripe_len) {
+       while (cur_logical < logical + BTRFS_STRIPE_LEN) {
                 struct btrfs_io_context *bioc = NULL;
                 struct btrfs_device *extent_dev;
                 u64 extent_start;
@@ -3217,7 +3566,7 @@ static int scrub_raid56_data_stripe_for_parity(struct scrub_ctx *sctx,
                 u64 extent_mirror_num;
  
                 ret = find_first_extent_item(extent_root, path, cur_logical,
-                                            logical + map->stripe_len - cur_logical);
+                                            logical + BTRFS_STRIPE_LEN - cur_logical);
                 /* No more extent item in this data stripe */
                 if (ret > 0) {
                         ret = 0;
@@ -3231,7 +3580,7 @@ static int scrub_raid56_data_stripe_for_parity(struct scrub_ctx *sctx,
                 /* Metadata should not cross stripe boundaries */
                 if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
                     does_range_cross_boundary(extent_start, extent_size,
-                                             logical, map->stripe_len)) {
+                                             logical, BTRFS_STRIPE_LEN)) {
                         btrfs_err(fs_info,
         "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
                                   extent_start, logical);
@@ -3247,7 +3596,7 @@ static int scrub_raid56_data_stripe_for_parity(struct scrub_ctx *sctx,
  
                 /* Truncate the range inside this data stripe */
                 extent_size = min(extent_start + extent_size,
-                                 logical + map->stripe_len) - cur_logical;
+                                 logical + BTRFS_STRIPE_LEN) - cur_logical;
                 extent_start = cur_logical;
                 ASSERT(extent_size <= U32_MAX);
  
@@ -3320,8 +3669,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
         path->search_commit_root = 1;
         path->skip_locking = 1;
  
-       ASSERT(map->stripe_len <= U32_MAX);
-       nsectors = map->stripe_len >> fs_info->sectorsize_bits;
+       nsectors = BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits;
         ASSERT(nsectors <= BITS_PER_LONG);
         sparity = kzalloc(sizeof(struct scrub_parity), GFP_NOFS);
         if (!sparity) {
@@ -3332,8 +3680,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
                 return -ENOMEM;
         }
  
-       ASSERT(map->stripe_len <= U32_MAX);
-       sparity->stripe_len = map->stripe_len;
+       sparity->stripe_len = BTRFS_STRIPE_LEN;
         sparity->nsectors = nsectors;
         sparity->sctx = sctx;
         sparity->scrub_dev = sdev;
@@ -3344,7 +3691,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
  
         ret = 0;
         for (cur_logical = logic_start; cur_logical < logic_end;
-            cur_logical += map->stripe_len) {
+            cur_logical += BTRFS_STRIPE_LEN) {
                 ret = scrub_raid56_data_stripe_for_parity(sctx, sparity, map,
                                                           sdev, path, cur_logical);
                 if (ret < 0)
@@ -3401,6 +3748,149 @@ static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical,
         return ret;
  }
  
+/*
+ * Record one extent into @stripe.
+ *
+ * Every sector of the stripe overlapping the extent range
+ * [@extent_start, @extent_start + @extent_len) gets its bit set in
+ * @extent_sector_bitmap.  For tree block extents the sector is additionally
+ * marked as metadata and gets @extent_gen as its expected generation.
+ */
+static void fill_one_extent_info(struct btrfs_fs_info *fs_info,
+                                struct scrub_stripe *stripe,
+                                u64 extent_start, u64 extent_len,
+                                u64 extent_flags, u64 extent_gen)
+{
+       const bool is_tree_block = extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK;
+       /* Clamp the extent range to the range covered by the stripe. */
+       const u64 range_start = max(stripe->logical, extent_start);
+       const u64 range_end = min(stripe->logical + BTRFS_STRIPE_LEN,
+                                 extent_start + extent_len);
+       u64 bytenr;
+
+       for (bytenr = range_start; bytenr < range_end;
+            bytenr += fs_info->sectorsize) {
+               const int nr = (bytenr - stripe->logical) >>
+                              fs_info->sectorsize_bits;
+
+               set_bit(nr, &stripe->extent_sector_bitmap);
+               if (!is_tree_block)
+                       continue;
+
+               stripe->sectors[nr].is_metadata = true;
+               stripe->sectors[nr].generation = extent_gen;
+       }
+}
+
+static void scrub_stripe_reset_bitmaps(struct scrub_stripe *stripe)
+{
+       stripe->extent_sector_bitmap = 0;
+       stripe->init_error_bitmap = 0;
+       stripe->error_bitmap = 0;
+       stripe->io_error_bitmap = 0;
+       stripe->csum_error_bitmap = 0;
+       stripe->meta_error_bitmap = 0;
+}
+
+/*
+ * Locate the first stripe which has at least one extent inside the logical
+ * range [@logical_start, @logical_start + @logical_len) of @bg and fill
+ * @stripe for it.
+ *
+ * The per-sector info and all bitmaps of @stripe are reset before the search.
+ * Extent items are looked up in the commit root without locking.
+ *
+ * Return 0 if found such stripe, and store its info into @stripe.  The
+ * stripe is then flagged with SCRUB_STRIPE_FLAG_INITIALIZED.
+ * Return >0 if there is no such stripe in the specified range.
+ * Return <0 for error.
+ */
+int scrub_find_fill_first_stripe(struct btrfs_block_group *bg,
+                                struct btrfs_device *dev, u64 physical,
+                                int mirror_num, u64 logical_start,
+                                u32 logical_len, struct scrub_stripe *stripe)
+{
+       struct btrfs_fs_info *fs_info = bg->fs_info;
+       struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bg->start);
+       struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bg->start);
+       const u64 logical_end = logical_start + logical_len;
+       struct btrfs_path path = { 0 };
+       u64 first_logical;
+       u64 next_logical;
+       u64 stripe_end;
+       u64 extent_start;
+       u64 extent_len;
+       u64 extent_flags;
+       u64 extent_gen;
+       int ret;
+
+       memset(stripe->sectors, 0,
+              sizeof(*stripe->sectors) * stripe->nr_sectors);
+       scrub_stripe_reset_bitmaps(stripe);
+
+       /* The range must be inside the bg. */
+       ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length);
+
+       path.search_commit_root = 1;
+       path.skip_locking = 1;
+
+       ret = find_first_extent_item(extent_root, &path, logical_start, logical_len);
+       /* Either error or not found. */
+       if (ret)
+               goto out;
+       get_extent_info(&path, &extent_start, &extent_len, &extent_flags, &extent_gen);
+
+       /* The first extent may start before the requested range. */
+       first_logical = max(extent_start, logical_start);
+
+       /*
+        * Round down to stripe boundary.
+        *
+        * The extra calculation against bg->start is to handle block groups
+        * whose logical bytenr is not BTRFS_STRIPE_LEN aligned.
+        */
+       stripe->logical = round_down(first_logical - bg->start, BTRFS_STRIPE_LEN) +
+                         bg->start;
+       stripe->physical = physical + stripe->logical - logical_start;
+       stripe->dev = dev;
+       stripe->bg = bg;
+       stripe->mirror_num = mirror_num;
+       stripe_end = stripe->logical + BTRFS_STRIPE_LEN - 1;
+
+       /*
+        * Fill the extent info into stripe->sectors[] array, starting with
+        * the extent found above, then go on with the extents of the
+        * remaining sectors until the end of the stripe is passed or no
+        * more extent is found.
+        */
+       for (;;) {
+               fill_one_extent_info(fs_info, stripe, extent_start, extent_len,
+                                    extent_flags, extent_gen);
+               next_logical = extent_start + extent_len;
+               if (next_logical > stripe_end)
+                       break;
+
+               ret = find_first_extent_item(extent_root, &path, next_logical,
+                                            stripe_end - next_logical + 1);
+               if (ret < 0)
+                       goto out;
+               if (ret > 0) {
+                       /* No more extent in this stripe, not an error. */
+                       ret = 0;
+                       break;
+               }
+               get_extent_info(&path, &extent_start, &extent_len,
+                               &extent_flags, &extent_gen);
+       }
+
+       /* Now fill the data csum. */
+       if (bg->flags & BTRFS_BLOCK_GROUP_DATA) {
+               unsigned long csum_bitmap = 0;
+               int nr;
+
+               /* Csum space should have already been allocated. */
+               ASSERT(stripe->csums);
+
+               /*
+                * Our csum bitmap should be large enough, as BTRFS_STRIPE_LEN
+                * should contain at most 16 sectors.
+                */
+               ASSERT(BITS_PER_LONG >= BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits);
+
+               ret = btrfs_lookup_csums_bitmap(csum_root, stripe->logical,
+                                               stripe_end, stripe->csums,
+                                               &csum_bitmap, true);
+               if (ret < 0)
+                       goto out;
+               if (ret > 0)
+                       ret = 0;
+
+               /* Point every sector with a csum at its slot in the buffer. */
+               for_each_set_bit(nr, &csum_bitmap, stripe->nr_sectors) {
+                       stripe->sectors[nr].csum =
+                               &stripe->csums[nr * fs_info->csum_size];
+               }
+       }
+       set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state);
+out:
+       btrfs_release_path(&path);
+       return ret;
+}
+
  /*
   * Scrub one range which can only has simple mirror based profile.
   * (Including all range in SINGLE/DUP/RAID1/RAID1C*, and each stripe in
@@ -3410,8 +3900,6 @@ static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical,
   * and @logical_length parameter.
   */
  static int scrub_simple_mirror(struct scrub_ctx *sctx,
-                              struct btrfs_root *extent_root,
-                              struct btrfs_root *csum_root,
                                struct btrfs_block_group *bg,
                                struct map_lookup *map,
                                u64 logical_start, u64 logical_length,
@@ -3419,6 +3907,8 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx,
                                u64 physical, int mirror_num)
  {
         struct btrfs_fs_info *fs_info = sctx->fs_info;
+       struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bg->start);
+       struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bg->start);
         const u64 logical_end = logical_start + logical_length;
         /* An artificial limit, inherit from old scrub behavior */
         const u32 max_length = SZ_64K;
@@ -3536,7 +4026,7 @@ static u64 simple_stripe_full_stripe_len(const struct map_lookup *map)
         ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
                             BTRFS_BLOCK_GROUP_RAID10));
  
-       return map->num_stripes / map->sub_stripes * map->stripe_len;
+       return (map->num_stripes / map->sub_stripes) << BTRFS_STRIPE_LEN_SHIFT;
  }
  
  /* Get the logical bytenr for the stripe */
@@ -3552,7 +4042,8 @@ static u64 simple_stripe_get_logical(struct map_lookup *map,
          * (stripe_index / sub_stripes) gives how many data stripes we need to
          * skip.
          */
-       return (stripe_index / map->sub_stripes) * map->stripe_len + bg->start;
+       return ((stripe_index / map->sub_stripes) << BTRFS_STRIPE_LEN_SHIFT) +
+              bg->start;
  }
  
  /* Get the mirror number for the stripe */
@@ -3567,8 +4058,6 @@ static int simple_stripe_mirror_num(struct map_lookup *map, int stripe_index)
  }
  
  static int scrub_simple_stripe(struct scrub_ctx *sctx,
-                              struct btrfs_root *extent_root,
-                              struct btrfs_root *csum_root,
                                struct btrfs_block_group *bg,
                                struct map_lookup *map,
                                struct btrfs_device *device,
@@ -3588,15 +4077,15 @@ static int scrub_simple_stripe(struct scrub_ctx *sctx,
                  * just RAID1, so we can reuse scrub_simple_mirror() to scrub
                  * this stripe.
                  */
-               ret = scrub_simple_mirror(sctx, extent_root, csum_root, bg, map,
-                                         cur_logical, map->stripe_len, device,
-                                         cur_physical, mirror_num);
+               ret = scrub_simple_mirror(sctx, bg, map, cur_logical,
+                                         BTRFS_STRIPE_LEN, device, cur_physical,
+                                         mirror_num);
                 if (ret)
                         return ret;
                 /* Skip to next stripe which belongs to the target device */
                 cur_logical += logical_increment;
                 /* For physical offset, we just go to next stripe */
-               cur_physical += map->stripe_len;
+               cur_physical += BTRFS_STRIPE_LEN;
         }
         return ret;
  }
@@ -3607,10 +4096,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
                                            struct btrfs_device *scrub_dev,
                                            int stripe_index)
  {
-       struct btrfs_path *path;
         struct btrfs_fs_info *fs_info = sctx->fs_info;
-       struct btrfs_root *root;
-       struct btrfs_root *csum_root;
         struct blk_plug plug;
         struct map_lookup *map = em->map_lookup;
         const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
@@ -3629,26 +4115,10 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
         u64 stripe_end;
         int stop_loop = 0;
  
-       path = btrfs_alloc_path();
-       if (!path)
-               return -ENOMEM;
-
-       /*
-        * work on commit root. The related disk blocks are static as
-        * long as COW is applied. This means, it is save to rewrite
-        * them to repair disk errors without any race conditions
-        */
-       path->search_commit_root = 1;
-       path->skip_locking = 1;
-       path->reada = READA_FORWARD;
-
         wait_event(sctx->list_wait,
                    atomic_read(&sctx->bios_in_flight) == 0);
         scrub_blocked_if_needed(fs_info);
  
-       root = btrfs_extent_root(fs_info, bg->start);
-       csum_root = btrfs_csum_root(fs_info, bg->start);
-
         /*
          * collect all data csums for the stripe to avoid seeking during
          * the scrub. This might currently (crc32) end up to be about 1MB
@@ -3680,17 +4150,15 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
                  * Only @physical and @mirror_num needs to calculated using
                  * @stripe_index.
                  */
-               ret = scrub_simple_mirror(sctx, root, csum_root, bg, map,
-                               bg->start, bg->length, scrub_dev,
-                               map->stripes[stripe_index].physical,
+               ret = scrub_simple_mirror(sctx, bg, map, bg->start, bg->length,
+                               scrub_dev, map->stripes[stripe_index].physical,
                                 stripe_index + 1);
                 offset = 0;
                 goto out;
         }
         if (profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
-               ret = scrub_simple_stripe(sctx, root, csum_root, bg, map,
-                                         scrub_dev, stripe_index);
-               offset = map->stripe_len * (stripe_index / map->sub_stripes);
+               ret = scrub_simple_stripe(sctx, bg, map, scrub_dev, stripe_index);
+               offset = (stripe_index / map->sub_stripes) << BTRFS_STRIPE_LEN_SHIFT;
                 goto out;
         }
  
@@ -3705,7 +4173,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
  
         /* Initialize @offset in case we need to go to out: label */
         get_raid56_logic_offset(physical, stripe_index, map, &offset, NULL);
-       increment = map->stripe_len * nr_data_stripes(map);
+       increment = nr_data_stripes(map) << BTRFS_STRIPE_LEN_SHIFT;
  
         /*
          * Due to the rotation, for RAID56 it's better to iterate each stripe
@@ -3735,14 +4203,13 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
                  * We can reuse scrub_simple_mirror() here, as the repair part
                  * is still based on @mirror_num.
                  */
-               ret = scrub_simple_mirror(sctx, root, csum_root, bg, map,
-                                         logical, map->stripe_len,
+               ret = scrub_simple_mirror(sctx, bg, map, logical, BTRFS_STRIPE_LEN,
                                           scrub_dev, physical, 1);
                 if (ret < 0)
                         goto out;
  next:
                 logical += increment;
-               physical += map->stripe_len;
+               physical += BTRFS_STRIPE_LEN;
                 spin_lock(&sctx->stat_lock);
                 if (stop_loop)
                         sctx->stat.last_physical =
@@ -3761,7 +4228,6 @@ out:
         mutex_unlock(&sctx->wr_lock);
  
         blk_finish_plug(&plug);
-       btrfs_free_path(path);
  
         if (sctx->is_dev_replace && ret >= 0) {
                 int ret2;
@@ -4168,18 +4634,62 @@ skip:
         return ret;
  }
  
+static int scrub_one_super(struct scrub_ctx *sctx, struct btrfs_device *dev,
+                          struct page *page, u64 physical, u64 generation)
+{ /* Read one super block copy from @dev at @physical into @page and verify it. */
+       struct btrfs_fs_info *fs_info = sctx->fs_info;
+       struct bio_vec bvec;
+       struct bio bio;
+       struct btrfs_super_block *sb = page_address(page); /* checked in place inside @page */
+       int ret;
+
+       bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_READ); /* local bio, one bio_vec, read op */
+       bio.bi_iter.bi_sector = physical >> SECTOR_SHIFT; /* @physical as a sector number */
+       __bio_add_page(&bio, page, BTRFS_SUPER_INFO_SIZE, 0);
+       ret = submit_bio_wait(&bio); /* presumably blocks until the read completes -- confirm */
+       bio_uninit(&bio);
+
+       if (ret < 0)
+               return ret; /* read error is passed up unchanged */
+       ret = btrfs_check_super_csum(fs_info, sb);
+       if (ret != 0) {
+               btrfs_err_rl(fs_info,
+                       "super block at physical %llu devid %llu has bad csum",
+                       physical, dev->devid);
+               return -EIO; /* any csum check failure is reported as -EIO */
+       }
+       if (btrfs_super_generation(sb) != generation) { /* caller supplies the expected value */
+               btrfs_err_rl(fs_info,
+"super block at physical %llu devid %llu has bad generation %llu expect %llu",
+                            physical, dev->devid,
+                            btrfs_super_generation(sb), generation);
+               return -EUCLEAN; /* generation mismatch is reported as -EUCLEAN */
+       }
+
+       return btrfs_validate_super(fs_info, sb, -1); /* NOTE(review): meaning of -1 not visible here -- confirm */
+}
+
  static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
                                            struct btrfs_device *scrub_dev)
  {
         int     i;
         u64     bytenr;
         u64     gen;
-       int     ret;
+       int ret = 0;
+       struct page *page;
         struct btrfs_fs_info *fs_info = sctx->fs_info;
  
         if (BTRFS_FS_ERROR(fs_info))
                 return -EROFS;
  
+       page = alloc_page(GFP_KERNEL);
+       if (!page) {
+               spin_lock(&sctx->stat_lock);
+               sctx->stat.malloc_errors++;
+               spin_unlock(&sctx->stat_lock);
+               return -ENOMEM;
+       }
+
         /* Seed devices of a new filesystem has their own generation. */
         if (scrub_dev->fs_devices != fs_info->fs_devices)
                 gen = scrub_dev->generation;
@@ -4194,14 +4704,14 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
                 if (!btrfs_check_super_location(scrub_dev, bytenr))
                         continue;
  
-               ret = scrub_sectors(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
-                                   scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
-                                   NULL, bytenr);
-               if (ret)
-                       return ret;
+               ret = scrub_one_super(sctx, scrub_dev, page, bytenr, gen);
+               if (ret) {
+                       spin_lock(&sctx->stat_lock);
+                       sctx->stat.super_errors++;
+                       spin_unlock(&sctx->stat_lock);
+               }
         }
-       wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
-
+       __free_page(page);
         return 0;
  }
  
@@ -4541,28 +5051,3 @@ int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
  
         return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
  }
-
-static void scrub_find_good_copy(struct btrfs_fs_info *fs_info,
-                                u64 extent_logical, u32 extent_len,
-                                u64 *extent_physical,
-                                struct btrfs_device **extent_dev,
-                                int *extent_mirror_num)
-{
-       u64 mapped_length;
-       struct btrfs_io_context *bioc = NULL;
-       int ret;
-
-       mapped_length = extent_len;
-       ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical,
-                             &mapped_length, &bioc, 0);
-       if (ret || !bioc || mapped_length < extent_len ||
-           !bioc->stripes[0].dev->bdev) {
-               btrfs_put_bioc(bioc);
-               return;
-       }
-
-       *extent_physical = bioc->stripes[0].physical;
-       *extent_mirror_num = bioc->mirror_num;
-       *extent_dev = bioc->stripes[0].dev;
-       btrfs_put_bioc(bioc);
-}