
Merge tag 'dm-4.2-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device...
author Linus Torvalds <torvalds@linux-foundation.org>
Thu, 25 Jun 2015 23:34:39 +0000 (16:34 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 25 Jun 2015 23:34:39 +0000 (16:34 -0700)
Pull device mapper updates from Mike Snitzer:

 - DM core cleanups:

     * blk-mq request-based DM no longer uses any mempools now that
       partial completions are no longer handled as part of cloned
       requests

 - DM raid cleanups and support for MD raid0

 - DM cache core advances and a new stochastic-multi-queue (smq) cache
   replacement policy

     * smq is the new default dm-cache policy

 - DM thinp cleanups and much more efficient large discard support

 - DM statistics support for request-based DM and nanosecond resolution
   timestamps

 - Fixes to DM stripe, DM log-writes, DM raid1 and DM crypt

* tag 'dm-4.2-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (39 commits)
  dm stats: add support for request-based DM devices
  dm stats: collect and report histogram of IO latencies
  dm stats: support precise timestamps
  dm stats: fix divide by zero if 'number_of_areas' arg is zero
  dm cache: switch the "default" cache replacement policy from mq to smq
  dm space map metadata: fix occasional leak of a metadata block on resize
  dm thin metadata: fix a race when entering fail mode
  dm thin: fail messages with EOPNOTSUPP when pool cannot handle messages
  dm thin: range discard support
  dm thin metadata: add dm_thin_remove_range()
  dm thin metadata: add dm_thin_find_mapped_range()
  dm btree: add dm_btree_remove_leaves()
  dm stats: Use kvfree() in dm_kvfree()
  dm cache: age and write back cache entries even without active IO
  dm cache: prefix all DMERR and DMINFO messages with cache device name
  dm cache: add fail io mode and needs_check flag
  dm cache: wake the worker thread every time we free a migration object
  dm cache: add stochastic-multi-queue (smq) policy
  dm cache: boost promotion of blocks that will be overwritten
  dm cache: defer whole cells
  ...

33 files changed:
Documentation/device-mapper/cache-policies.txt
Documentation/device-mapper/cache.txt
Documentation/device-mapper/dm-raid.txt
Documentation/device-mapper/statistics.txt
drivers/md/Kconfig
drivers/md/Makefile
drivers/md/dm-bio-prison.c
drivers/md/dm-bio-prison.h
drivers/md/dm-cache-metadata.c
drivers/md/dm-cache-metadata.h
drivers/md/dm-cache-policy-cleaner.c
drivers/md/dm-cache-policy-internal.h
drivers/md/dm-cache-policy-mq.c
drivers/md/dm-cache-policy-smq.c [new file with mode: 0644]
drivers/md/dm-cache-policy.h
drivers/md/dm-cache-target.c
drivers/md/dm-crypt.c
drivers/md/dm-log-writes.c
drivers/md/dm-raid.c
drivers/md/dm-raid1.c
drivers/md/dm-stats.c
drivers/md/dm-stats.h
drivers/md/dm-stripe.c
drivers/md/dm-table.c
drivers/md/dm-thin-metadata.c
drivers/md/dm-thin-metadata.h
drivers/md/dm-thin.c
drivers/md/dm.c
drivers/md/persistent-data/dm-block-manager.c
drivers/md/persistent-data/dm-block-manager.h
drivers/md/persistent-data/dm-btree-remove.c
drivers/md/persistent-data/dm-btree.h
drivers/md/persistent-data/dm-space-map-metadata.c

diff --git a/Documentation/device-mapper/cache-policies.txt b/Documentation/device-mapper/cache-policies.txt
index 0d124a9..d9246a3 100644
@@ -25,10 +25,10 @@ trying to see when the io scheduler has let the ios run.
 Overview of supplied cache replacement policies
 ===============================================
 
-multiqueue
-----------
+multiqueue (mq)
+---------------
 
-This policy is the default.
+This policy has been deprecated in favor of the smq policy (see below).
 
 The multiqueue policy has three sets of 16 queues: one set for entries
 waiting for the cache and another two for those in the cache (a set for
@@ -73,6 +73,67 @@ If you're trying to quickly warm a new cache device you may wish to
 reduce these to encourage promotion.  Remember to switch them back to
 their defaults after the cache fills though.
 
+Stochastic multiqueue (smq)
+---------------------------
+
+This policy is the default.
+
+The stochastic multi-queue (smq) policy addresses some of the problems
+with the multiqueue (mq) policy.
+
+The smq policy (vs mq) offers the promise of less memory utilization,
+improved performance and increased adaptability in the face of changing
+workloads.  SMQ also does not have any cumbersome tuning knobs.
+
+Users may switch from "mq" to "smq" simply by appropriately reloading a
+DM table that is using the cache target.  Doing so will cause all of the
+mq policy's hints to be dropped.  Also, performance of the cache may
+degrade slightly until smq recalculates the origin device's hotspots
+that should be cached.
+
+Memory usage:
+The mq policy uses a lot of memory: 88 bytes per cache block on a
+64-bit machine.
+
+SMQ uses 28-bit indexes to implement its data structures rather than
+pointers.  It avoids storing an explicit hit count for each block.  It
+has a 'hotspot' queue, rather than a pre-cache, which uses a quarter of
+the entries (each hotspot block covers a larger area than a single
+cache block).
+
+All of this means smq uses ~25 bytes per cache block.  Still a lot of
+memory, but a substantial improvement nonetheless.
+
+Level balancing:
+MQ places entries in different levels of the multiqueue structures
+based on their hit count (~ln(hit count)).  This means the bottom
+levels generally have the most entries, and the top ones have very
+few.  Having unbalanced levels like this reduces the efficacy of the
+multiqueue.
+
+SMQ does not maintain a hit count; instead it swaps hit entries with
+the least recently used entry from the level above.  The overall
+ordering is a side effect of this stochastic process.  With this
+scheme we can decide how many entries occupy each multiqueue level,
+resulting in better promotion/demotion decisions.
+
+Adaptability:
+The MQ policy maintains a hit count for each cache block.  For a
+different block to get promoted to the cache, its hit count has to
+exceed the lowest hit count currently in the cache.  This means it
+can take a long time for the cache to adapt between varying IO patterns.
+Periodically degrading the hit counts could help with this, but I
+haven't found a nice general solution.
+
+SMQ doesn't maintain hit counts, so a lot of this problem just goes
+away.  In addition it tracks performance of the hotspot queue, which
+is used to decide which blocks to promote.  If the hotspot queue is
+performing badly then it starts moving entries more quickly between
+levels.  This lets it adapt to new IO patterns very quickly.
+
+Performance:
+Testing SMQ shows substantially better performance than MQ.
+
 cleaner
 -------
 
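The memory-usage paragraphs above explain how smq gets from mq's 88 bytes
per cache block down to roughly 25: pointers are replaced with 28-bit
indexes into a preallocated entry array and the per-block hit count is
dropped.  The standalone C sketch below illustrates the effect of that
packing.  Both structs are illustrative stand-ins (the real smq entry
appears in dm-cache-policy-smq.c later in this diff; the pointer-based
struct is not the actual mq entry).

#include <stdio.h>
#include <stdint.h>

/* A pointer-based node, roughly what a conventional policy needs. */
struct ptr_entry {
	struct ptr_entry *hash_next;
	struct ptr_entry *prev;
	struct ptr_entry *next;
	unsigned hit_count;
	unsigned level;
	uint64_t oblock;
};

/* An index-based node mirroring the smq entry layout. */
struct idx_entry {
	unsigned hash_next:28;
	unsigned prev:28;
	unsigned next:28;
	unsigned level:7;
	unsigned dirty:1;
	unsigned allocated:1;
	unsigned sentinel:1;
	uint64_t oblock;
};

int main(void)
{
	/* Typically prints 40 vs 24 bytes on a 64-bit machine. */
	printf("pointer-based entry: %zu bytes\n", sizeof(struct ptr_entry));
	printf("index-based entry:   %zu bytes\n", sizeof(struct idx_entry));
	return 0;
}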
diff --git a/Documentation/device-mapper/cache.txt b/Documentation/device-mapper/cache.txt
index 68c0f51..82960cf 100644
@@ -221,6 +221,7 @@ Status
 <#read hits> <#read misses> <#write hits> <#write misses>
 <#demotions> <#promotions> <#dirty> <#features> <features>*
 <#core args> <core args>* <policy name> <#policy args> <policy args>*
+<cache metadata mode>
 
 metadata block size     : Fixed block size for each metadata block in
                             sectors
@@ -251,8 +252,12 @@ core args           : Key/value pairs for tuning the core
                             e.g. migration_threshold
 policy name             : Name of the policy
 #policy args            : Number of policy arguments to follow (must be even)
-policy args             : Key/value pairs
-                            e.g. sequential_threshold
+policy args             : Key/value pairs e.g. sequential_threshold
+cache metadata mode      : ro if read-only, rw if read-write
+       In serious cases where even a read-only mode is deemed unsafe
+       no further I/O will be permitted and the status will just
+       contain the string 'Fail'.  The userspace recovery tools
+       should then be used.
 
 Messages
 --------
diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt
index ef8ba9f..cb12af3 100644
@@ -224,3 +224,5 @@ Version History
        New status (STATUSTYPE_INFO) fields: sync_action and mismatch_cnt.
 1.5.1   Add ability to restore transiently failed devices on resume.
 1.5.2   'mismatch_cnt' is zero unless [last_]sync_action is "check".
+1.6.0   Add discard support (and devices_handle_discard_safely module param).
+1.7.0   Add support for MD RAID0 mappings.
diff --git a/Documentation/device-mapper/statistics.txt b/Documentation/device-mapper/statistics.txt
index 2a1673a..4919b2d 100644
@@ -13,9 +13,14 @@ the range specified.
 The I/O statistics counters for each step-sized area of a region are
 in the same format as /sys/block/*/stat or /proc/diskstats (see:
 Documentation/iostats.txt).  But two extra counters (12 and 13) are
-provided: total time spent reading and writing in milliseconds.         All
-these counters may be accessed by sending the @stats_print message to
-the appropriate DM device via dmsetup.
+provided: total time spent reading and writing.  When the histogram
+argument is used, a 14th parameter representing the histogram of
+latencies is also reported.  All these counters may be accessed by sending
+the @stats_print message to the appropriate DM device via dmsetup.
+
+The reported times are in milliseconds and the granularity depends on
+the kernel ticks.  When the option precise_timestamps is used, the
+reported times are in nanoseconds.
 
 Each region has a corresponding unique identifier, which we call a
 region_id, that is assigned when the region is created.         The region_id
@@ -33,7 +38,9 @@ memory is used by reading
 Messages
 ========
 
-    @stats_create <range> <step> [<program_id> [<aux_data>]]
+    @stats_create <range> <step>
+               [<number_of_optional_arguments> <optional_arguments>...]
+               [<program_id> [<aux_data>]]
 
        Create a new region and return the region_id.
 
@@ -48,6 +55,29 @@ Messages
          "/<number_of_areas>" - the range is subdivided into the specified
                                 number of areas.
 
+       <number_of_optional_arguments>
+         The number of optional arguments
+
+       <optional_arguments>
+         The following optional arguments are supported
+         precise_timestamps - use precise timer with nanosecond resolution
+               instead of the "jiffies" variable.  When this argument is
+               used, the resulting times are in nanoseconds instead of
+               milliseconds.  Precise timestamps are a little bit slower
+               to obtain than jiffies-based timestamps.
+         histogram:n1,n2,n3,n4,... - collect histogram of latencies.  The
+               numbers n1, n2, etc are times that represent the boundaries
+               of the histogram.  If precise_timestamps is not used, the
+               times are in milliseconds, otherwise they are in
+               nanoseconds.  For each range, the kernel will report the
+               number of requests that completed within this range. For
+               example, if we use "histogram:10,20,30", the kernel will
+               report four numbers a:b:c:d. a is the number of requests
+               that took 0-10 ms to complete, b is the number of requests
+               that took 10-20 ms to complete, c is the number of requests
+               that took 20-30 ms to complete and d is the number of
+               requests that took more than 30 ms to complete.
+
        <program_id>
          An optional parameter.  A name that uniquely identifies
          the userspace owner of the range.  This groups ranges together
@@ -55,6 +85,9 @@ Messages
          created and ignore those created by others.
          The kernel returns this string back in the output of
          @stats_list message, but it doesn't use it for anything else.
+         If we omit the number of optional arguments, program id must not
+         be a number, otherwise it would be interpreted as the number of
+         optional arguments.
 
        <aux_data>
          An optional parameter.  A word that provides auxiliary data
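The histogram description above reduces to a simple bucketing rule: n
boundaries define n+1 buckets, with the last bucket catching everything
above the highest boundary.  A minimal userspace C sketch of that rule
(illustrative only; whether a value exactly on a boundary lands in the
lower or upper bucket is an assumption here, the text only gives ranges):

#include <stdio.h>

/* Return the bucket index for one completion time, given the boundaries. */
static unsigned histogram_bucket(const unsigned *bounds, unsigned nr_bounds,
				 unsigned duration)
{
	unsigned i;

	for (i = 0; i < nr_bounds; i++)
		if (duration < bounds[i])
			return i;

	return nr_bounds;	/* "more than the last boundary" bucket */
}

int main(void)
{
	unsigned bounds[] = { 10, 20, 30 };	/* as in "histogram:10,20,30" */
	unsigned counts[4] = { 0 };
	unsigned samples[] = { 3, 12, 25, 31, 7, 40 };	/* made-up latencies, ms */
	unsigned i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		counts[histogram_bucket(bounds, 3, samples[i])]++;

	/* Reported in the same a:b:c:d form as @stats_print. */
	printf("%u:%u:%u:%u\n", counts[0], counts[1], counts[2], counts[3]);
	return 0;
}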
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index edcf4ab..b597273 100644
@@ -304,6 +304,18 @@ config DM_CACHE_MQ
          This is meant to be a general purpose policy.  It prioritises
          reads over writes.
 
+config DM_CACHE_SMQ
+       tristate "Stochastic MQ Cache Policy (EXPERIMENTAL)"
+       depends on DM_CACHE
+       default y
+       ---help---
+         A cache policy that uses a multiqueue ordered by recent hits
+         to select which blocks should be promoted and demoted.
+         This is meant to be a general purpose policy.  It prioritises
+         reads over writes.  This SMQ policy (vs MQ) offers the promise
+         of less memory utilization, improved performance and increased
+         adaptability in the face of changing workloads.
+
 config DM_CACHE_CLEANER
        tristate "Cleaner Cache Policy (EXPERIMENTAL)"
        depends on DM_CACHE
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index dba4db5..462f443 100644
@@ -13,6 +13,7 @@ dm-log-userspace-y \
 dm-thin-pool-y += dm-thin.o dm-thin-metadata.o
 dm-cache-y     += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o
 dm-cache-mq-y   += dm-cache-policy-mq.o
+dm-cache-smq-y   += dm-cache-policy-smq.o
 dm-cache-cleaner-y += dm-cache-policy-cleaner.o
 dm-era-y       += dm-era-target.o
 md-mod-y       += md.o bitmap.o
@@ -54,6 +55,7 @@ obj-$(CONFIG_DM_THIN_PROVISIONING)    += dm-thin-pool.o
 obj-$(CONFIG_DM_VERITY)                += dm-verity.o
 obj-$(CONFIG_DM_CACHE)         += dm-cache.o
 obj-$(CONFIG_DM_CACHE_MQ)      += dm-cache-mq.o
+obj-$(CONFIG_DM_CACHE_SMQ)     += dm-cache-smq.o
 obj-$(CONFIG_DM_CACHE_CLEANER) += dm-cache-cleaner.o
 obj-$(CONFIG_DM_ERA)           += dm-era.o
 obj-$(CONFIG_DM_LOG_WRITES)    += dm-log-writes.o
diff --git a/drivers/md/dm-bio-prison.c b/drivers/md/dm-bio-prison.c
index be06530..cd6d1d2 100644
@@ -255,6 +255,32 @@ void dm_cell_visit_release(struct dm_bio_prison *prison,
 }
 EXPORT_SYMBOL_GPL(dm_cell_visit_release);
 
+static int __promote_or_release(struct dm_bio_prison *prison,
+                               struct dm_bio_prison_cell *cell)
+{
+       if (bio_list_empty(&cell->bios)) {
+               rb_erase(&cell->node, &prison->cells);
+               return 1;
+       }
+
+       cell->holder = bio_list_pop(&cell->bios);
+       return 0;
+}
+
+int dm_cell_promote_or_release(struct dm_bio_prison *prison,
+                              struct dm_bio_prison_cell *cell)
+{
+       int r;
+       unsigned long flags;
+
+       spin_lock_irqsave(&prison->lock, flags);
+       r = __promote_or_release(prison, cell);
+       spin_unlock_irqrestore(&prison->lock, flags);
+
+       return r;
+}
+EXPORT_SYMBOL_GPL(dm_cell_promote_or_release);
+
 /*----------------------------------------------------------------*/
 
 #define DEFERRED_SET_SIZE 64
diff --git a/drivers/md/dm-bio-prison.h b/drivers/md/dm-bio-prison.h
index 74cf011..54352f0 100644
@@ -101,6 +101,19 @@ void dm_cell_visit_release(struct dm_bio_prison *prison,
                           void (*visit_fn)(void *, struct dm_bio_prison_cell *),
                           void *context, struct dm_bio_prison_cell *cell);
 
+/*
+ * Rather than always releasing the prisoners in a cell, the client may
+ * want to promote one of them to be the new holder.  There is a race here
+ * though between releasing an empty cell, and other threads adding new
+ * inmates.  So this function makes the decision with its lock held.
+ *
+ * This function can have two outcomes:
+ * i) An inmate is promoted to be the holder of the cell (return value of 0).
+ * ii) The cell has no inmate for promotion and is released (return value of 1).
+ */
+int dm_cell_promote_or_release(struct dm_bio_prison *prison,
+                              struct dm_bio_prison_cell *cell);
+
 /*----------------------------------------------------------------*/
 
 /*
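The comment above pins down the calling convention for the new
dm_cell_promote_or_release(): after finishing with the current holder,
the client calls it and either carries on with a freshly promoted holder
(return 0) or learns that the cell was empty and has been released
(return 1).  A toy userspace model of that loop, with ints standing in
for bios and all names invented for the sketch:

#include <stdio.h>

struct toy_cell {
	int holder;
	int waiters[4];
	unsigned nr_waiters;
};

/* Returns 1 if the cell is now empty (released), 0 if a waiter was promoted. */
static int toy_promote_or_release(struct toy_cell *cell)
{
	unsigned i;

	if (!cell->nr_waiters)
		return 1;

	cell->holder = cell->waiters[0];
	for (i = 1; i < cell->nr_waiters; i++)
		cell->waiters[i - 1] = cell->waiters[i];
	cell->nr_waiters--;

	return 0;
}

int main(void)
{
	struct toy_cell cell = { .holder = 1, .waiters = { 2, 3 }, .nr_waiters = 2 };

	printf("processing holder %d\n", cell.holder);
	while (!toy_promote_or_release(&cell))
		printf("processing promoted holder %d\n", cell.holder);
	printf("cell released\n");

	return 0;
}

The real function makes the same decision under the prison spinlock, which
is the point of the comment: checking for emptiness and releasing must be
atomic with respect to other threads adding new inmates to the cell.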
diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c
index c1c0104..20cc36b 100644
@@ -39,6 +39,8 @@
 enum superblock_flag_bits {
        /* for spotting crashes that would invalidate the dirty bitset */
        CLEAN_SHUTDOWN,
+       /* metadata must be checked using the tools */
+       NEEDS_CHECK,
 };
 
 /*
@@ -107,6 +109,7 @@ struct dm_cache_metadata {
        struct dm_disk_bitset discard_info;
 
        struct rw_semaphore root_lock;
+       unsigned long flags;
        dm_block_t root;
        dm_block_t hint_root;
        dm_block_t discard_root;
@@ -129,6 +132,14 @@ struct dm_cache_metadata {
         * buffer before the superblock is locked and updated.
         */
        __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];
+
+       /*
+        * Set if a transaction has to be aborted but the attempt to roll
+        * back to the previous (good) transaction failed.  The only
+        * metadata operation permissible in this state is the closing of
+        * the device.
+        */
+       bool fail_io:1;
 };
 
 /*-------------------------------------------------------------------
@@ -527,6 +538,7 @@ static unsigned long clear_clean_shutdown(unsigned long flags)
 static void read_superblock_fields(struct dm_cache_metadata *cmd,
                                   struct cache_disk_superblock *disk_super)
 {
+       cmd->flags = le32_to_cpu(disk_super->flags);
        cmd->root = le64_to_cpu(disk_super->mapping_root);
        cmd->hint_root = le64_to_cpu(disk_super->hint_root);
        cmd->discard_root = le64_to_cpu(disk_super->discard_root);
@@ -625,6 +637,7 @@ static int __commit_transaction(struct dm_cache_metadata *cmd,
        if (mutator)
                update_flags(disk_super, mutator);
 
+       disk_super->flags = cpu_to_le32(cmd->flags);
        disk_super->mapping_root = cpu_to_le64(cmd->root);
        disk_super->hint_root = cpu_to_le64(cmd->hint_root);
        disk_super->discard_root = cpu_to_le64(cmd->discard_root);
@@ -693,6 +706,7 @@ static struct dm_cache_metadata *metadata_open(struct block_device *bdev,
        cmd->cache_blocks = 0;
        cmd->policy_hint_size = policy_hint_size;
        cmd->changed = true;
+       cmd->fail_io = false;
 
        r = __create_persistent_data_objects(cmd, may_format_device);
        if (r) {
@@ -796,7 +810,8 @@ void dm_cache_metadata_close(struct dm_cache_metadata *cmd)
                list_del(&cmd->list);
                mutex_unlock(&table_lock);
 
-               __destroy_persistent_data_objects(cmd);
+               if (!cmd->fail_io)
+                       __destroy_persistent_data_objects(cmd);
                kfree(cmd);
        }
 }
@@ -848,13 +863,26 @@ static int blocks_are_unmapped_or_clean(struct dm_cache_metadata *cmd,
        return 0;
 }
 
+#define WRITE_LOCK(cmd) \
+       if (cmd->fail_io || dm_bm_is_read_only(cmd->bm)) \
+               return -EINVAL; \
+       down_write(&cmd->root_lock)
+
+#define WRITE_LOCK_VOID(cmd) \
+       if (cmd->fail_io || dm_bm_is_read_only(cmd->bm)) \
+               return; \
+       down_write(&cmd->root_lock)
+
+#define WRITE_UNLOCK(cmd) \
+       up_write(&cmd->root_lock)
+
 int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size)
 {
        int r;
        bool clean;
        __le64 null_mapping = pack_value(0, 0);
 
-       down_write(&cmd->root_lock);
+       WRITE_LOCK(cmd);
        __dm_bless_for_disk(&null_mapping);
 
        if (from_cblock(new_cache_size) < from_cblock(cmd->cache_blocks)) {
@@ -880,7 +908,7 @@ int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size)
        cmd->changed = true;
 
 out:
-       up_write(&cmd->root_lock);
+       WRITE_UNLOCK(cmd);
 
        return r;
 }
@@ -891,7 +919,7 @@ int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd,
 {
        int r;
 
-       down_write(&cmd->root_lock);
+       WRITE_LOCK(cmd);
        r = dm_bitset_resize(&cmd->discard_info,
                             cmd->discard_root,
                             from_dblock(cmd->discard_nr_blocks),
@@ -903,7 +931,7 @@ int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd,
        }
 
        cmd->changed = true;
-       up_write(&cmd->root_lock);
+       WRITE_UNLOCK(cmd);
 
        return r;
 }
@@ -946,9 +974,9 @@ int dm_cache_set_discard(struct dm_cache_metadata *cmd,
 {
        int r;
 
-       down_write(&cmd->root_lock);
+       WRITE_LOCK(cmd);
        r = __discard(cmd, dblock, discard);
-       up_write(&cmd->root_lock);
+       WRITE_UNLOCK(cmd);
 
        return r;
 }
@@ -1020,9 +1048,9 @@ int dm_cache_remove_mapping(struct dm_cache_metadata *cmd, dm_cblock_t cblock)
 {
        int r;
 
-       down_write(&cmd->root_lock);
+       WRITE_LOCK(cmd);
        r = __remove(cmd, cblock);
-       up_write(&cmd->root_lock);
+       WRITE_UNLOCK(cmd);
 
        return r;
 }
@@ -1048,9 +1076,9 @@ int dm_cache_insert_mapping(struct dm_cache_metadata *cmd,
 {
        int r;
 
-       down_write(&cmd->root_lock);
+       WRITE_LOCK(cmd);
        r = __insert(cmd, cblock, oblock);
-       up_write(&cmd->root_lock);
+       WRITE_UNLOCK(cmd);
 
        return r;
 }
@@ -1234,9 +1262,9 @@ int dm_cache_set_dirty(struct dm_cache_metadata *cmd,
 {
        int r;
 
-       down_write(&cmd->root_lock);
+       WRITE_LOCK(cmd);
        r = __dirty(cmd, cblock, dirty);
-       up_write(&cmd->root_lock);
+       WRITE_UNLOCK(cmd);
 
        return r;
 }
@@ -1252,9 +1280,9 @@ void dm_cache_metadata_get_stats(struct dm_cache_metadata *cmd,
 void dm_cache_metadata_set_stats(struct dm_cache_metadata *cmd,
                                 struct dm_cache_statistics *stats)
 {
-       down_write(&cmd->root_lock);
+       WRITE_LOCK_VOID(cmd);
        cmd->stats = *stats;
-       up_write(&cmd->root_lock);
+       WRITE_UNLOCK(cmd);
 }
 
 int dm_cache_commit(struct dm_cache_metadata *cmd, bool clean_shutdown)
@@ -1263,7 +1291,7 @@ int dm_cache_commit(struct dm_cache_metadata *cmd, bool clean_shutdown)
        flags_mutator mutator = (clean_shutdown ? set_clean_shutdown :
                                 clear_clean_shutdown);
 
-       down_write(&cmd->root_lock);
+       WRITE_LOCK(cmd);
        r = __commit_transaction(cmd, mutator);
        if (r)
                goto out;
@@ -1271,7 +1299,7 @@ int dm_cache_commit(struct dm_cache_metadata *cmd, bool clean_shutdown)
        r = __begin_transaction(cmd);
 
 out:
-       up_write(&cmd->root_lock);
+       WRITE_UNLOCK(cmd);
        return r;
 }
 
@@ -1376,9 +1404,9 @@ int dm_cache_write_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *
 {
        int r;
 
-       down_write(&cmd->root_lock);
+       WRITE_LOCK(cmd);
        r = write_hints(cmd, policy);
-       up_write(&cmd->root_lock);
+       WRITE_UNLOCK(cmd);
 
        return r;
 }
@@ -1387,3 +1415,70 @@ int dm_cache_metadata_all_clean(struct dm_cache_metadata *cmd, bool *result)
 {
        return blocks_are_unmapped_or_clean(cmd, 0, cmd->cache_blocks, result);
 }
+
+void dm_cache_metadata_set_read_only(struct dm_cache_metadata *cmd)
+{
+       WRITE_LOCK_VOID(cmd);
+       dm_bm_set_read_only(cmd->bm);
+       WRITE_UNLOCK(cmd);
+}
+
+void dm_cache_metadata_set_read_write(struct dm_cache_metadata *cmd)
+{
+       WRITE_LOCK_VOID(cmd);
+       dm_bm_set_read_write(cmd->bm);
+       WRITE_UNLOCK(cmd);
+}
+
+int dm_cache_metadata_set_needs_check(struct dm_cache_metadata *cmd)
+{
+       int r;
+       struct dm_block *sblock;
+       struct cache_disk_superblock *disk_super;
+
+       /*
+        * We ignore fail_io for this function.
+        */
+       down_write(&cmd->root_lock);
+       set_bit(NEEDS_CHECK, &cmd->flags);
+
+       r = superblock_lock(cmd, &sblock);
+       if (r) {
+               DMERR("couldn't read superblock");
+               goto out;
+       }
+
+       disk_super = dm_block_data(sblock);
+       disk_super->flags = cpu_to_le32(cmd->flags);
+
+       dm_bm_unlock(sblock);
+
+out:
+       up_write(&cmd->root_lock);
+       return r;
+}
+
+bool dm_cache_metadata_needs_check(struct dm_cache_metadata *cmd)
+{
+       bool needs_check;
+
+       down_read(&cmd->root_lock);
+       needs_check = !!test_bit(NEEDS_CHECK, &cmd->flags);
+       up_read(&cmd->root_lock);
+
+       return needs_check;
+}
+
+int dm_cache_metadata_abort(struct dm_cache_metadata *cmd)
+{
+       int r;
+
+       WRITE_LOCK(cmd);
+       __destroy_persistent_data_objects(cmd);
+       r = __create_persistent_data_objects(cmd, false);
+       if (r)
+               cmd->fail_io = true;
+       WRITE_UNLOCK(cmd);
+
+       return r;
+}
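The WRITE_LOCK()/WRITE_LOCK_VOID() macros added above guard every
metadata-mutating entry point: if the metadata is in fail_io mode or the
block manager is read-only, they return from the *calling* function
(-EINVAL, or nothing for void functions) before the root lock is taken.
A userspace sketch of the same early-return-from-macro pattern follows;
the pthread rwlock and the flag names are stand-ins, not the kernel API:

#include <stdio.h>
#include <errno.h>
#include <pthread.h>

struct toy_metadata {
	pthread_rwlock_t root_lock;
	int fail_io;
	int read_only;
};

/* Bail out of the caller before taking the lock if writes are not allowed. */
#define TOY_WRITE_LOCK(md) \
	do { \
		if ((md)->fail_io || (md)->read_only) \
			return -EINVAL; \
		pthread_rwlock_wrlock(&(md)->root_lock); \
	} while (0)

#define TOY_WRITE_UNLOCK(md) pthread_rwlock_unlock(&(md)->root_lock)

static int toy_set_dirty(struct toy_metadata *md, unsigned block)
{
	TOY_WRITE_LOCK(md);
	printf("marking block %u dirty\n", block);
	TOY_WRITE_UNLOCK(md);
	return 0;
}

int main(void)
{
	struct toy_metadata md = { .fail_io = 0, .read_only = 0 };

	pthread_rwlock_init(&md.root_lock, NULL);
	toy_set_dirty(&md, 7);

	md.fail_io = 1;
	printf("after fail_io: toy_set_dirty() -> %d\n", toy_set_dirty(&md, 8));

	pthread_rwlock_destroy(&md.root_lock);
	return 0;
}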
diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h
index 4ecc403..2ffee21 100644
@@ -102,6 +102,10 @@ struct dm_cache_statistics {
 
 void dm_cache_metadata_get_stats(struct dm_cache_metadata *cmd,
                                 struct dm_cache_statistics *stats);
+
+/*
+ * 'void' because it's no big deal if it fails.
+ */
 void dm_cache_metadata_set_stats(struct dm_cache_metadata *cmd,
                                 struct dm_cache_statistics *stats);
 
@@ -133,6 +137,12 @@ int dm_cache_write_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *
  */
 int dm_cache_metadata_all_clean(struct dm_cache_metadata *cmd, bool *result);
 
+bool dm_cache_metadata_needs_check(struct dm_cache_metadata *cmd);
+int dm_cache_metadata_set_needs_check(struct dm_cache_metadata *cmd);
+void dm_cache_metadata_set_read_only(struct dm_cache_metadata *cmd);
+void dm_cache_metadata_set_read_write(struct dm_cache_metadata *cmd);
+int dm_cache_metadata_abort(struct dm_cache_metadata *cmd);
+
 /*----------------------------------------------------------------*/
 
 #endif /* DM_CACHE_METADATA_H */
diff --git a/drivers/md/dm-cache-policy-cleaner.c b/drivers/md/dm-cache-policy-cleaner.c
index b04d1f9..240c9f0 100644
@@ -171,7 +171,8 @@ static void remove_cache_hash_entry(struct wb_cache_entry *e)
 /* Public interface (see dm-cache-policy.h */
 static int wb_map(struct dm_cache_policy *pe, dm_oblock_t oblock,
                  bool can_block, bool can_migrate, bool discarded_oblock,
-                 struct bio *bio, struct policy_result *result)
+                 struct bio *bio, struct policy_locker *locker,
+                 struct policy_result *result)
 {
        struct policy *p = to_policy(pe);
        struct wb_cache_entry *e;
@@ -358,7 +359,8 @@ static struct wb_cache_entry *get_next_dirty_entry(struct policy *p)
 
 static int wb_writeback_work(struct dm_cache_policy *pe,
                             dm_oblock_t *oblock,
-                            dm_cblock_t *cblock)
+                            dm_cblock_t *cblock,
+                            bool critical_only)
 {
        int r = -ENOENT;
        struct policy *p = to_policy(pe);
diff --git a/drivers/md/dm-cache-policy-internal.h b/drivers/md/dm-cache-policy-internal.h
index 2256a1f..2816018 100644
@@ -7,6 +7,7 @@
 #ifndef DM_CACHE_POLICY_INTERNAL_H
 #define DM_CACHE_POLICY_INTERNAL_H
 
+#include <linux/vmalloc.h>
 #include "dm-cache-policy.h"
 
 /*----------------------------------------------------------------*/
  */
 static inline int policy_map(struct dm_cache_policy *p, dm_oblock_t oblock,
                             bool can_block, bool can_migrate, bool discarded_oblock,
-                            struct bio *bio, struct policy_result *result)
+                            struct bio *bio, struct policy_locker *locker,
+                            struct policy_result *result)
 {
-       return p->map(p, oblock, can_block, can_migrate, discarded_oblock, bio, result);
+       return p->map(p, oblock, can_block, can_migrate, discarded_oblock, bio, locker, result);
 }
 
 static inline int policy_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock)
@@ -54,9 +56,10 @@ static inline int policy_walk_mappings(struct dm_cache_policy *p,
 
 static inline int policy_writeback_work(struct dm_cache_policy *p,
                                        dm_oblock_t *oblock,
-                                       dm_cblock_t *cblock)
+                                       dm_cblock_t *cblock,
+                                       bool critical_only)
 {
-       return p->writeback_work ? p->writeback_work(p, oblock, cblock) : -ENOENT;
+       return p->writeback_work ? p->writeback_work(p, oblock, cblock, critical_only) : -ENOENT;
 }
 
 static inline void policy_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock)
@@ -80,19 +83,21 @@ static inline dm_cblock_t policy_residency(struct dm_cache_policy *p)
        return p->residency(p);
 }
 
-static inline void policy_tick(struct dm_cache_policy *p)
+static inline void policy_tick(struct dm_cache_policy *p, bool can_block)
 {
        if (p->tick)
-               return p->tick(p);
+               return p->tick(p, can_block);
 }
 
-static inline int policy_emit_config_values(struct dm_cache_policy *p, char *result, unsigned maxlen)
+static inline int policy_emit_config_values(struct dm_cache_policy *p, char *result,
+                                           unsigned maxlen, ssize_t *sz_ptr)
 {
-       ssize_t sz = 0;
+       ssize_t sz = *sz_ptr;
        if (p->emit_config_values)
-               return p->emit_config_values(p, result, maxlen);
+               return p->emit_config_values(p, result, maxlen, sz_ptr);
 
-       DMEMIT("0");
+       DMEMIT("0 ");
+       *sz_ptr = sz;
        return 0;
 }
 
@@ -105,6 +110,33 @@ static inline int policy_set_config_value(struct dm_cache_policy *p,
 /*----------------------------------------------------------------*/
 
 /*
+ * Some utility functions commonly used by policies and the core target.
+ */
+static inline size_t bitset_size_in_bytes(unsigned nr_entries)
+{
+       return sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG);
+}
+
+static inline unsigned long *alloc_bitset(unsigned nr_entries)
+{
+       size_t s = bitset_size_in_bytes(nr_entries);
+       return vzalloc(s);
+}
+
+static inline void clear_bitset(void *bitset, unsigned nr_entries)
+{
+       size_t s = bitset_size_in_bytes(nr_entries);
+       memset(bitset, 0, s);
+}
+
+static inline void free_bitset(unsigned long *bits)
+{
+       vfree(bits);
+}
+
+/*----------------------------------------------------------------*/
+
+/*
  * Creates a new cache policy given a policy name, a cache size, an origin size and the block size.
  */
 struct dm_cache_policy *dm_cache_policy_create(const char *name, dm_cblock_t cache_size,
diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
index 3ddd116..3281437 100644
@@ -693,9 +693,10 @@ static void requeue(struct mq_policy *mq, struct entry *e)
  * - set the hit count to a hard coded value other than 1, eg, is it better
  *   if it goes in at level 2?
  */
-static int demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock)
+static int demote_cblock(struct mq_policy *mq,
+                        struct policy_locker *locker, dm_oblock_t *oblock)
 {
-       struct entry *demoted = pop(mq, &mq->cache_clean);
+       struct entry *demoted = peek(&mq->cache_clean);
 
        if (!demoted)
                /*
@@ -707,6 +708,13 @@ static int demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock)
                 */
                return -ENOSPC;
 
+       if (locker->fn(locker, demoted->oblock))
+               /*
+                * We couldn't lock the demoted block.
+                */
+               return -EBUSY;
+
+       del(mq, demoted);
        *oblock = demoted->oblock;
        free_entry(&mq->cache_pool, demoted);
 
@@ -795,6 +803,7 @@ static int cache_entry_found(struct mq_policy *mq,
  * finding which cache block to use.
  */
 static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e,
+                             struct policy_locker *locker,
                              struct policy_result *result)
 {
        int r;
@@ -803,11 +812,12 @@ static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e,
        /* Ensure there's a free cblock in the cache */
        if (epool_empty(&mq->cache_pool)) {
                result->op = POLICY_REPLACE;
-               r = demote_cblock(mq, &result->old_oblock);
+               r = demote_cblock(mq, locker, &result->old_oblock);
                if (r) {
                        result->op = POLICY_MISS;
                        return 0;
                }
+
        } else
                result->op = POLICY_NEW;
 
@@ -829,7 +839,8 @@ static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e,
 
 static int pre_cache_entry_found(struct mq_policy *mq, struct entry *e,
                                 bool can_migrate, bool discarded_oblock,
-                                int data_dir, struct policy_result *result)
+                                int data_dir, struct policy_locker *locker,
+                                struct policy_result *result)
 {
        int r = 0;
 
@@ -842,7 +853,7 @@ static int pre_cache_entry_found(struct mq_policy *mq, struct entry *e,
 
        else {
                requeue(mq, e);
-               r = pre_cache_to_cache(mq, e, result);
+               r = pre_cache_to_cache(mq, e, locker, result);
        }
 
        return r;
@@ -872,6 +883,7 @@ static void insert_in_pre_cache(struct mq_policy *mq,
 }
 
 static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock,
+                           struct policy_locker *locker,
                            struct policy_result *result)
 {
        int r;
@@ -879,7 +891,7 @@ static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock,
 
        if (epool_empty(&mq->cache_pool)) {
                result->op = POLICY_REPLACE;
-               r = demote_cblock(mq, &result->old_oblock);
+               r = demote_cblock(mq, locker, &result->old_oblock);
                if (unlikely(r)) {
                        result->op = POLICY_MISS;
                        insert_in_pre_cache(mq, oblock);
@@ -907,11 +919,12 @@ static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock,
 
 static int no_entry_found(struct mq_policy *mq, dm_oblock_t oblock,
                          bool can_migrate, bool discarded_oblock,
-                         int data_dir, struct policy_result *result)
+                         int data_dir, struct policy_locker *locker,
+                         struct policy_result *result)
 {
        if (adjusted_promote_threshold(mq, discarded_oblock, data_dir) <= 1) {
                if (can_migrate)
-                       insert_in_cache(mq, oblock, result);
+                       insert_in_cache(mq, oblock, locker, result);
                else
                        return -EWOULDBLOCK;
        } else {
@@ -928,7 +941,8 @@ static int no_entry_found(struct mq_policy *mq, dm_oblock_t oblock,
  */
 static int map(struct mq_policy *mq, dm_oblock_t oblock,
               bool can_migrate, bool discarded_oblock,
-              int data_dir, struct policy_result *result)
+              int data_dir, struct policy_locker *locker,
+              struct policy_result *result)
 {
        int r = 0;
        struct entry *e = hash_lookup(mq, oblock);
@@ -942,11 +956,11 @@ static int map(struct mq_policy *mq, dm_oblock_t oblock,
 
        else if (e)
                r = pre_cache_entry_found(mq, e, can_migrate, discarded_oblock,
-                                         data_dir, result);
+                                         data_dir, locker, result);
 
        else
                r = no_entry_found(mq, oblock, can_migrate, discarded_oblock,
-                                  data_dir, result);
+                                  data_dir, locker, result);
 
        if (r == -EWOULDBLOCK)
                result->op = POLICY_MISS;
@@ -1012,7 +1026,8 @@ static void copy_tick(struct mq_policy *mq)
 
 static int mq_map(struct dm_cache_policy *p, dm_oblock_t oblock,
                  bool can_block, bool can_migrate, bool discarded_oblock,
-                 struct bio *bio, struct policy_result *result)
+                 struct bio *bio, struct policy_locker *locker,
+                 struct policy_result *result)
 {
        int r;
        struct mq_policy *mq = to_mq_policy(p);
@@ -1028,7 +1043,7 @@ static int mq_map(struct dm_cache_policy *p, dm_oblock_t oblock,
 
        iot_examine_bio(&mq->tracker, bio);
        r = map(mq, oblock, can_migrate, discarded_oblock,
-               bio_data_dir(bio), result);
+               bio_data_dir(bio), locker, result);
 
        mutex_unlock(&mq->lock);
 
@@ -1221,7 +1236,7 @@ static int __mq_writeback_work(struct mq_policy *mq, dm_oblock_t *oblock,
 }
 
 static int mq_writeback_work(struct dm_cache_policy *p, dm_oblock_t *oblock,
-                            dm_cblock_t *cblock)
+                            dm_cblock_t *cblock, bool critical_only)
 {
        int r;
        struct mq_policy *mq = to_mq_policy(p);
@@ -1268,7 +1283,7 @@ static dm_cblock_t mq_residency(struct dm_cache_policy *p)
        return r;
 }
 
-static void mq_tick(struct dm_cache_policy *p)
+static void mq_tick(struct dm_cache_policy *p, bool can_block)
 {
        struct mq_policy *mq = to_mq_policy(p);
        unsigned long flags;
@@ -1276,6 +1291,12 @@ static void mq_tick(struct dm_cache_policy *p)
        spin_lock_irqsave(&mq->tick_lock, flags);
        mq->tick_protected++;
        spin_unlock_irqrestore(&mq->tick_lock, flags);
+
+       if (can_block) {
+               mutex_lock(&mq->lock);
+               copy_tick(mq);
+               mutex_unlock(&mq->lock);
+       }
 }
 
 static int mq_set_config_value(struct dm_cache_policy *p,
@@ -1308,22 +1329,24 @@ static int mq_set_config_value(struct dm_cache_policy *p,
        return 0;
 }
 
-static int mq_emit_config_values(struct dm_cache_policy *p, char *result, unsigned maxlen)
+static int mq_emit_config_values(struct dm_cache_policy *p, char *result,
+                                unsigned maxlen, ssize_t *sz_ptr)
 {
-       ssize_t sz = 0;
+       ssize_t sz = *sz_ptr;
        struct mq_policy *mq = to_mq_policy(p);
 
        DMEMIT("10 random_threshold %u "
               "sequential_threshold %u "
               "discard_promote_adjustment %u "
               "read_promote_adjustment %u "
-              "write_promote_adjustment %u",
+              "write_promote_adjustment %u ",
               mq->tracker.thresholds[PATTERN_RANDOM],
               mq->tracker.thresholds[PATTERN_SEQUENTIAL],
               mq->discard_promote_adjustment,
               mq->read_promote_adjustment,
               mq->write_promote_adjustment);
 
+       *sz_ptr = sz;
        return 0;
 }
 
@@ -1408,21 +1431,12 @@ bad_pre_cache_init:
 
 static struct dm_cache_policy_type mq_policy_type = {
        .name = "mq",
-       .version = {1, 3, 0},
+       .version = {1, 4, 0},
        .hint_size = 4,
        .owner = THIS_MODULE,
        .create = mq_create
 };
 
-static struct dm_cache_policy_type default_policy_type = {
-       .name = "default",
-       .version = {1, 3, 0},
-       .hint_size = 4,
-       .owner = THIS_MODULE,
-       .create = mq_create,
-       .real = &mq_policy_type
-};
-
 static int __init mq_init(void)
 {
        int r;
@@ -1432,36 +1446,21 @@ static int __init mq_init(void)
                                           __alignof__(struct entry),
                                           0, NULL);
        if (!mq_entry_cache)
-               goto bad;
+               return -ENOMEM;
 
        r = dm_cache_policy_register(&mq_policy_type);
        if (r) {
                DMERR("register failed %d", r);
-               goto bad_register_mq;
-       }
-
-       r = dm_cache_policy_register(&default_policy_type);
-       if (!r) {
-               DMINFO("version %u.%u.%u loaded",
-                      mq_policy_type.version[0],
-                      mq_policy_type.version[1],
-                      mq_policy_type.version[2]);
-               return 0;
+               kmem_cache_destroy(mq_entry_cache);
+               return -ENOMEM;
        }
 
-       DMERR("register failed (as default) %d", r);
-
-       dm_cache_policy_unregister(&mq_policy_type);
-bad_register_mq:
-       kmem_cache_destroy(mq_entry_cache);
-bad:
-       return -ENOMEM;
+       return 0;
 }
 
 static void __exit mq_exit(void)
 {
        dm_cache_policy_unregister(&mq_policy_type);
-       dm_cache_policy_unregister(&default_policy_type);
 
        kmem_cache_destroy(mq_entry_cache);
 }
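The demote_cblock() change in the mq policy above is a peek/check/commit
pattern: the policy now peeks at the least recently used clean entry,
asks the caller-supplied policy_locker whether that origin block is
already locked, and only removes the entry from the queue when the
answer is no, returning -EBUSY otherwise (the caller then falls back to
a miss).  A toy userspace sketch of the pattern, with invented names and
a trivial locker that never reports a block as busy:

#include <stdio.h>
#include <stdbool.h>

struct toy_locker {
	/* Returns true if the block is already locked and must be skipped. */
	bool (*fn)(struct toy_locker *l, unsigned oblock);
};

static unsigned lru_clean[] = { 42, 7, 13 };	/* oldest first */
static unsigned lru_len = 3;

static int toy_demote(struct toy_locker *locker, unsigned *oblock)
{
	unsigned i;

	if (!lru_len)
		return -1;		/* nothing clean to demote */

	if (locker->fn(locker, lru_clean[0]))
		return -2;		/* candidate is busy; treat as a miss */

	*oblock = lru_clean[0];		/* commit: remove it from the queue */
	for (i = 1; i < lru_len; i++)
		lru_clean[i - 1] = lru_clean[i];
	lru_len--;

	return 0;
}

static bool nothing_locked(struct toy_locker *l, unsigned oblock)
{
	(void)l;
	(void)oblock;
	return false;
}

int main(void)
{
	struct toy_locker locker = { .fn = nothing_locked };
	unsigned oblock;

	while (!toy_demote(&locker, &oblock))
		printf("demoting oblock %u\n", oblock);

	return 0;
}

The old code popped the entry unconditionally; with the locker check the
caller gets a chance to veto a candidate before it is removed from the
queue.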
diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c
new file mode 100644
index 0000000..80f02d3
--- /dev/null
+++ b/drivers/md/dm-cache-policy-smq.c
@@ -0,0 +1,1791 @@
+/*
+ * Copyright (C) 2015 Red Hat. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-cache-policy.h"
+#include "dm-cache-policy-internal.h"
+#include "dm.h"
+
+#include <linux/hash.h>
+#include <linux/jiffies.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/vmalloc.h>
+#include <linux/math64.h>
+
+#define DM_MSG_PREFIX "cache-policy-smq"
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Safe division functions that return zero on divide by zero.
+ */
+static unsigned safe_div(unsigned n, unsigned d)
+{
+       return d ? n / d : 0u;
+}
+
+static unsigned safe_mod(unsigned n, unsigned d)
+{
+       return d ? n % d : 0u;
+}
+
+/*----------------------------------------------------------------*/
+
+struct entry {
+       unsigned hash_next:28;
+       unsigned prev:28;
+       unsigned next:28;
+       unsigned level:7;
+       bool dirty:1;
+       bool allocated:1;
+       bool sentinel:1;
+
+       dm_oblock_t oblock;
+};
+
+/*----------------------------------------------------------------*/
+
+#define INDEXER_NULL ((1u << 28u) - 1u)
+
+/*
+ * An entry_space manages a set of entries that we use for the queues.
+ * The clean and dirty queues share entries, so this object is separate
+ * from the queue itself.
+ */
+struct entry_space {
+       struct entry *begin;
+       struct entry *end;
+};
+
+static int space_init(struct entry_space *es, unsigned nr_entries)
+{
+       if (!nr_entries) {
+               es->begin = es->end = NULL;
+               return 0;
+       }
+
+       es->begin = vzalloc(sizeof(struct entry) * nr_entries);
+       if (!es->begin)
+               return -ENOMEM;
+
+       es->end = es->begin + nr_entries;
+       return 0;
+}
+
+static void space_exit(struct entry_space *es)
+{
+       vfree(es->begin);
+}
+
+static struct entry *__get_entry(struct entry_space *es, unsigned block)
+{
+       struct entry *e;
+
+       e = es->begin + block;
+       BUG_ON(e >= es->end);
+
+       return e;
+}
+
+static unsigned to_index(struct entry_space *es, struct entry *e)
+{
+       BUG_ON(e < es->begin || e >= es->end);
+       return e - es->begin;
+}
+
+static struct entry *to_entry(struct entry_space *es, unsigned block)
+{
+       if (block == INDEXER_NULL)
+               return NULL;
+
+       return __get_entry(es, block);
+}
+
+/*----------------------------------------------------------------*/
+
+struct ilist {
+       unsigned nr_elts;       /* excluding sentinel entries */
+       unsigned head, tail;
+};
+
+static void l_init(struct ilist *l)
+{
+       l->nr_elts = 0;
+       l->head = l->tail = INDEXER_NULL;
+}
+
+static struct entry *l_head(struct entry_space *es, struct ilist *l)
+{
+       return to_entry(es, l->head);
+}
+
+static struct entry *l_tail(struct entry_space *es, struct ilist *l)
+{
+       return to_entry(es, l->tail);
+}
+
+static struct entry *l_next(struct entry_space *es, struct entry *e)
+{
+       return to_entry(es, e->next);
+}
+
+static struct entry *l_prev(struct entry_space *es, struct entry *e)
+{
+       return to_entry(es, e->prev);
+}
+
+static bool l_empty(struct ilist *l)
+{
+       return l->head == INDEXER_NULL;
+}
+
+static void l_add_head(struct entry_space *es, struct ilist *l, struct entry *e)
+{
+       struct entry *head = l_head(es, l);
+
+       e->next = l->head;
+       e->prev = INDEXER_NULL;
+
+       if (head)
+               head->prev = l->head = to_index(es, e);
+       else
+               l->head = l->tail = to_index(es, e);
+
+       if (!e->sentinel)
+               l->nr_elts++;
+}
+
+static void l_add_tail(struct entry_space *es, struct ilist *l, struct entry *e)
+{
+       struct entry *tail = l_tail(es, l);
+
+       e->next = INDEXER_NULL;
+       e->prev = l->tail;
+
+       if (tail)
+               tail->next = l->tail = to_index(es, e);
+       else
+               l->head = l->tail = to_index(es, e);
+
+       if (!e->sentinel)
+               l->nr_elts++;
+}
+
+static void l_add_before(struct entry_space *es, struct ilist *l,
+                        struct entry *old, struct entry *e)
+{
+       struct entry *prev = l_prev(es, old);
+
+       if (!prev)
+               l_add_head(es, l, e);
+
+       else {
+               e->prev = old->prev;
+               e->next = to_index(es, old);
+               prev->next = old->prev = to_index(es, e);
+
+               if (!e->sentinel)
+                       l->nr_elts++;
+       }
+}
+
+static void l_del(struct entry_space *es, struct ilist *l, struct entry *e)
+{
+       struct entry *prev = l_prev(es, e);
+       struct entry *next = l_next(es, e);
+
+       if (prev)
+               prev->next = e->next;
+       else
+               l->head = e->next;
+
+       if (next)
+               next->prev = e->prev;
+       else
+               l->tail = e->prev;
+
+       if (!e->sentinel)
+               l->nr_elts--;
+}
+
+static struct entry *l_pop_tail(struct entry_space *es, struct ilist *l)
+{
+       struct entry *e;
+
+       for (e = l_tail(es, l); e; e = l_prev(es, e))
+               if (!e->sentinel) {
+                       l_del(es, l, e);
+                       return e;
+               }
+
+       return NULL;
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * The stochastic-multi-queue is a set of lru lists stacked into levels.
+ * Entries are moved up levels when they are used, which loosely orders the
+ * most accessed entries in the top levels and least in the bottom.  This
+ * structure is *much* better than a single lru list.
+ */
+#define MAX_LEVELS 64u
+
+struct queue {
+       struct entry_space *es;
+
+       unsigned nr_elts;
+       unsigned nr_levels;
+       struct ilist qs[MAX_LEVELS];
+
+       /*
+        * We maintain a count of the number of entries we would like in each
+        * level.
+        */
+       unsigned last_target_nr_elts;
+       unsigned nr_top_levels;
+       unsigned nr_in_top_levels;
+       unsigned target_count[MAX_LEVELS];
+};
+
+static void q_init(struct queue *q, struct entry_space *es, unsigned nr_levels)
+{
+       unsigned i;
+
+       q->es = es;
+       q->nr_elts = 0;
+       q->nr_levels = nr_levels;
+
+       for (i = 0; i < q->nr_levels; i++) {
+               l_init(q->qs + i);
+               q->target_count[i] = 0u;
+       }
+
+       q->last_target_nr_elts = 0u;
+       q->nr_top_levels = 0u;
+       q->nr_in_top_levels = 0u;
+}
+
+static unsigned q_size(struct queue *q)
+{
+       return q->nr_elts;
+}
+
+/*
+ * Insert an entry to the back of the given level.
+ */
+static void q_push(struct queue *q, struct entry *e)
+{
+       if (!e->sentinel)
+               q->nr_elts++;
+
+       l_add_tail(q->es, q->qs + e->level, e);
+}
+
+static void q_push_before(struct queue *q, struct entry *old, struct entry *e)
+{
+       if (!e->sentinel)
+               q->nr_elts++;
+
+       l_add_before(q->es, q->qs + e->level, old, e);
+}
+
+static void q_del(struct queue *q, struct entry *e)
+{
+       l_del(q->es, q->qs + e->level, e);
+       if (!e->sentinel)
+               q->nr_elts--;
+}
+
+/*
+ * Return the oldest entry of the lowest populated level.
+ */
+static struct entry *q_peek(struct queue *q, unsigned max_level, bool can_cross_sentinel)
+{
+       unsigned level;
+       struct entry *e;
+
+       max_level = min(max_level, q->nr_levels);
+
+       for (level = 0; level < max_level; level++)
+               for (e = l_head(q->es, q->qs + level); e; e = l_next(q->es, e)) {
+                       if (e->sentinel) {
+                               if (can_cross_sentinel)
+                                       continue;
+                               else
+                                       break;
+                       }
+
+                       return e;
+               }
+
+       return NULL;
+}
+
+static struct entry *q_pop(struct queue *q)
+{
+       struct entry *e = q_peek(q, q->nr_levels, true);
+
+       if (e)
+               q_del(q, e);
+
+       return e;
+}
+
+/*
+ * Pops an entry from a level that is not past a sentinel.
+ */
+static struct entry *q_pop_old(struct queue *q, unsigned max_level)
+{
+       struct entry *e = q_peek(q, max_level, false);
+
+       if (e)
+               q_del(q, e);
+
+       return e;
+}
+
+/*
+ * This function assumes there is a non-sentinel entry to pop.  It's only
+ * used by redistribute, so we know this is true.  It also doesn't adjust
+ * the q->nr_elts count.
+ */
+static struct entry *__redist_pop_from(struct queue *q, unsigned level)
+{
+       struct entry *e;
+
+       for (; level < q->nr_levels; level++)
+               for (e = l_head(q->es, q->qs + level); e; e = l_next(q->es, e))
+                       if (!e->sentinel) {
+                               l_del(q->es, q->qs + e->level, e);
+                               return e;
+                       }
+
+       return NULL;
+}
+
+static void q_set_targets_subrange_(struct queue *q, unsigned nr_elts, unsigned lbegin, unsigned lend)
+{
+       unsigned level, nr_levels, entries_per_level, remainder;
+
+       BUG_ON(lbegin > lend);
+       BUG_ON(lend > q->nr_levels);
+       nr_levels = lend - lbegin;
+       entries_per_level = safe_div(nr_elts, nr_levels);
+       remainder = safe_mod(nr_elts, nr_levels);
+
+       for (level = lbegin; level < lend; level++)
+               q->target_count[level] =
+                       (level < (lbegin + remainder)) ? entries_per_level + 1u : entries_per_level;
+}
+
+/*
+ * Typically we have fewer elements in the top few levels which allows us
+ * to adjust the promote threshold nicely.
+ */
+static void q_set_targets(struct queue *q)
+{
+       if (q->last_target_nr_elts == q->nr_elts)
+               return;
+
+       q->last_target_nr_elts = q->nr_elts;
+
+       if (q->nr_top_levels > q->nr_levels)
+               q_set_targets_subrange_(q, q->nr_elts, 0, q->nr_levels);
+
+       else {
+               q_set_targets_subrange_(q, q->nr_in_top_levels,
+                                       q->nr_levels - q->nr_top_levels, q->nr_levels);
+
+               if (q->nr_in_top_levels < q->nr_elts)
+                       q_set_targets_subrange_(q, q->nr_elts - q->nr_in_top_levels,
+                                               0, q->nr_levels - q->nr_top_levels);
+               else
+                       q_set_targets_subrange_(q, 0, 0, q->nr_levels - q->nr_top_levels);
+       }
+}
+
+static void q_redistribute(struct queue *q)
+{
+       unsigned target, level;
+       struct ilist *l, *l_above;
+       struct entry *e;
+
+       q_set_targets(q);
+
+       for (level = 0u; level < q->nr_levels - 1u; level++) {
+               l = q->qs + level;
+               target = q->target_count[level];
+
+               /*
+                * Pull down some entries from the level above.
+                */
+               while (l->nr_elts < target) {
+                       e = __redist_pop_from(q, level + 1u);
+                       if (!e) {
+                               /* bug in nr_elts */
+                               break;
+                       }
+
+                       e->level = level;
+                       l_add_tail(q->es, l, e);
+               }
+
+               /*
+                * Push some entries up.
+                */
+               l_above = q->qs + level + 1u;
+               while (l->nr_elts > target) {
+                       e = l_pop_tail(q->es, l);
+
+                       if (!e)
+                               /* bug in nr_elts */
+                               break;
+
+                       e->level = level + 1u;
+                       l_add_head(q->es, l_above, e);
+               }
+       }
+}
+
+static void q_requeue_before(struct queue *q, struct entry *dest, struct entry *e, unsigned extra_levels)
+{
+       struct entry *de;
+       unsigned new_level;
+
+       q_del(q, e);
+
+       if (extra_levels && (e->level < q->nr_levels - 1u)) {
+               new_level = min(q->nr_levels - 1u, e->level + extra_levels);
+               for (de = l_head(q->es, q->qs + new_level); de; de = l_next(q->es, de)) {
+                       if (de->sentinel)
+                               continue;
+
+                       q_del(q, de);
+                       de->level = e->level;
+
+                       if (dest)
+                               q_push_before(q, dest, de);
+                       else
+                               q_push(q, de);
+                       break;
+               }
+
+               e->level = new_level;
+       }
+
+       q_push(q, e);
+}
+
+static void q_requeue(struct queue *q, struct entry *e, unsigned extra_levels)
+{
+       q_requeue_before(q, NULL, e, extra_levels);
+}
+
+/*----------------------------------------------------------------*/
+
+#define FP_SHIFT 8
+#define SIXTEENTH (1u << (FP_SHIFT - 4u))
+#define EIGHTH (1u << (FP_SHIFT - 3u))
+
+struct stats {
+       unsigned hit_threshold;
+       unsigned hits;
+       unsigned misses;
+};
+
+enum performance {
+       Q_POOR,
+       Q_FAIR,
+       Q_WELL
+};
+
+static void stats_init(struct stats *s, unsigned nr_levels)
+{
+       s->hit_threshold = (nr_levels * 3u) / 4u;
+       s->hits = 0u;
+       s->misses = 0u;
+}
+
+static void stats_reset(struct stats *s)
+{
+       s->hits = s->misses = 0u;
+}
+
+static void stats_level_accessed(struct stats *s, unsigned level)
+{
+       if (level >= s->hit_threshold)
+               s->hits++;
+       else
+               s->misses++;
+}
+
+static void stats_miss(struct stats *s)
+{
+       s->misses++;
+}
+
+/*
+ * There are times when we don't have any confidence in the hotspot queue.
+ * Such as when a fresh cache is created and the blocks have been spread
+ * out across the levels, or if an io load changes.  We detect this by
+ * seeing how often a lookup is in the top levels of the hotspot queue.
+ */
+static enum performance stats_assess(struct stats *s)
+{
+       unsigned confidence = safe_div(s->hits << FP_SHIFT, s->hits + s->misses);
+
+       if (confidence < SIXTEENTH)
+               return Q_POOR;
+
+       else if (confidence < EIGHTH)
+               return Q_FAIR;
+
+       else
+               return Q_WELL;
+}
+
+/*----------------------------------------------------------------*/
+
+struct hash_table {
+       struct entry_space *es;
+       unsigned long long hash_bits;
+       unsigned *buckets;
+};
+
+/*
+ * All cache entries are stored in a chained hash table.  To save space we
+ * use indexing again, and only store indexes to the next entry.
+ */
+static int h_init(struct hash_table *ht, struct entry_space *es, unsigned nr_entries)
+{
+       unsigned i, nr_buckets;
+
+       ht->es = es;
+       nr_buckets = roundup_pow_of_two(max(nr_entries / 4u, 16u));
+       ht->hash_bits = ffs(nr_buckets) - 1;
+
+       ht->buckets = vmalloc(sizeof(*ht->buckets) * nr_buckets);
+       if (!ht->buckets)
+               return -ENOMEM;
+
+       for (i = 0; i < nr_buckets; i++)
+               ht->buckets[i] = INDEXER_NULL;
+
+       return 0;
+}
+
+static void h_exit(struct hash_table *ht)
+{
+       vfree(ht->buckets);
+}
+
+static struct entry *h_head(struct hash_table *ht, unsigned bucket)
+{
+       return to_entry(ht->es, ht->buckets[bucket]);
+}
+
+static struct entry *h_next(struct hash_table *ht, struct entry *e)
+{
+       return to_entry(ht->es, e->hash_next);
+}
+
+static void __h_insert(struct hash_table *ht, unsigned bucket, struct entry *e)
+{
+       e->hash_next = ht->buckets[bucket];
+       ht->buckets[bucket] = to_index(ht->es, e);
+}
+
+static void h_insert(struct hash_table *ht, struct entry *e)
+{
+       unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits);
+       __h_insert(ht, h, e);
+}
+
+static struct entry *__h_lookup(struct hash_table *ht, unsigned h, dm_oblock_t oblock,
+                               struct entry **prev)
+{
+       struct entry *e;
+
+       *prev = NULL;
+       for (e = h_head(ht, h); e; e = h_next(ht, e)) {
+               if (e->oblock == oblock)
+                       return e;
+
+               *prev = e;
+       }
+
+       return NULL;
+}
+
+static void __h_unlink(struct hash_table *ht, unsigned h,
+                      struct entry *e, struct entry *prev)
+{
+       if (prev)
+               prev->hash_next = e->hash_next;
+       else
+               ht->buckets[h] = e->hash_next;
+}
+
+/*
+ * Also moves each entry to the front of the bucket.
+ */
+static struct entry *h_lookup(struct hash_table *ht, dm_oblock_t oblock)
+{
+       struct entry *e, *prev;
+       unsigned h = hash_64(from_oblock(oblock), ht->hash_bits);
+
+       e = __h_lookup(ht, h, oblock, &prev);
+       if (e && prev) {
+               /*
+                * Move to the front because this entry is likely
+                * to be hit again.
+                */
+               __h_unlink(ht, h, e, prev);
+               __h_insert(ht, h, e);
+       }
+
+       return e;
+}
+
+static void h_remove(struct hash_table *ht, struct entry *e)
+{
+       unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits);
+       struct entry *prev;
+
+       /*
+        * The down side of using a singly linked list is we have to
+        * iterate the bucket to remove an item.
+        */
+       e = __h_lookup(ht, h, e->oblock, &prev);
+       if (e)
+               __h_unlink(ht, h, e, prev);
+}
+
+/*----------------------------------------------------------------*/
+
+struct entry_alloc {
+       struct entry_space *es;
+       unsigned begin;
+
+       unsigned nr_allocated;
+       struct ilist free;
+};
+
+static void init_allocator(struct entry_alloc *ea, struct entry_space *es,
+                          unsigned begin, unsigned end)
+{
+       unsigned i;
+
+       ea->es = es;
+       ea->nr_allocated = 0u;
+       ea->begin = begin;
+
+       l_init(&ea->free);
+       for (i = begin; i != end; i++)
+               l_add_tail(ea->es, &ea->free, __get_entry(ea->es, i));
+}
+
+static void init_entry(struct entry *e)
+{
+       /*
+        * We can't memset because that would clear the hotspot and
+        * sentinel bits which remain constant.
+        */
+       e->hash_next = INDEXER_NULL;
+       e->next = INDEXER_NULL;
+       e->prev = INDEXER_NULL;
+       e->level = 0u;
+       e->allocated = true;
+}
+
+static struct entry *alloc_entry(struct entry_alloc *ea)
+{
+       struct entry *e;
+
+       if (l_empty(&ea->free))
+               return NULL;
+
+       e = l_pop_tail(ea->es, &ea->free);
+       init_entry(e);
+       ea->nr_allocated++;
+
+       return e;
+}
+
+/*
+ * This assumes the cblock hasn't already been allocated.
+ */
+static struct entry *alloc_particular_entry(struct entry_alloc *ea, unsigned i)
+{
+       struct entry *e = __get_entry(ea->es, ea->begin + i);
+
+       BUG_ON(e->allocated);
+
+       l_del(ea->es, &ea->free, e);
+       init_entry(e);
+       ea->nr_allocated++;
+
+       return e;
+}
+
+static void free_entry(struct entry_alloc *ea, struct entry *e)
+{
+       BUG_ON(!ea->nr_allocated);
+       BUG_ON(!e->allocated);
+
+       ea->nr_allocated--;
+       e->allocated = false;
+       l_add_tail(ea->es, &ea->free, e);
+}
+
+static bool allocator_empty(struct entry_alloc *ea)
+{
+       return l_empty(&ea->free);
+}
+
+static unsigned get_index(struct entry_alloc *ea, struct entry *e)
+{
+       return to_index(ea->es, e) - ea->begin;
+}
+
+static struct entry *get_entry(struct entry_alloc *ea, unsigned index)
+{
+       return __get_entry(ea->es, ea->begin + index);
+}
+
+/*----------------------------------------------------------------*/
+
+#define NR_HOTSPOT_LEVELS 64u
+#define NR_CACHE_LEVELS 64u
+
+#define WRITEBACK_PERIOD (10 * HZ)
+#define DEMOTE_PERIOD (60 * HZ)
+
+#define HOTSPOT_UPDATE_PERIOD (HZ)
+#define CACHE_UPDATE_PERIOD (10u * HZ)
+
+struct smq_policy {
+       struct dm_cache_policy policy;
+
+       /* protects everything */
+       struct mutex lock;
+       dm_cblock_t cache_size;
+       sector_t cache_block_size;
+
+       sector_t hotspot_block_size;
+       unsigned nr_hotspot_blocks;
+       unsigned cache_blocks_per_hotspot_block;
+       unsigned hotspot_level_jump;
+
+       struct entry_space es;
+       struct entry_alloc writeback_sentinel_alloc;
+       struct entry_alloc demote_sentinel_alloc;
+       struct entry_alloc hotspot_alloc;
+       struct entry_alloc cache_alloc;
+
+       unsigned long *hotspot_hit_bits;
+       unsigned long *cache_hit_bits;
+
+       /*
+        * We maintain three queues of entries.  The cache proper consists
+        * of a clean and a dirty queue, holding the currently active
+        * mappings.  The hotspot queue uses a larger block size to track
+        * blocks that are being hit frequently and are potential
+        * candidates for promotion to the cache.
+        */
+       struct queue hotspot;
+       struct queue clean;
+       struct queue dirty;
+
+       struct stats hotspot_stats;
+       struct stats cache_stats;
+
+       /*
+        * Keeps track of time, incremented by the core.  We use this to
+        * avoid attributing multiple hits within the same tick.
+        *
+        * Access to tick_protected should be done with the spin lock held.
+        * It's copied to tick at the start of the map function (within the
+        * mutex).
+        */
+       spinlock_t tick_lock;
+       unsigned tick_protected;
+       unsigned tick;
+
+       /*
+        * The hash tables allow us to quickly find an entry by origin
+        * block.
+        */
+       struct hash_table table;
+       struct hash_table hotspot_table;
+
+       bool current_writeback_sentinels;
+       unsigned long next_writeback_period;
+
+       bool current_demote_sentinels;
+       unsigned long next_demote_period;
+
+       unsigned write_promote_level;
+       unsigned read_promote_level;
+
+       unsigned long next_hotspot_period;
+       unsigned long next_cache_period;
+};
+
+/*----------------------------------------------------------------*/
+
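+/*
+ * Each sentinel allocator holds two sets of NR_CACHE_LEVELS sentinels;
+ * 'which' selects the set in use.  The sets are swapped periodically by
+ * update_sentinels() below.
+ */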
+static struct entry *get_sentinel(struct entry_alloc *ea, unsigned level, bool which)
+{
+       return get_entry(ea, which ? level : NR_CACHE_LEVELS + level);
+}
+
+static struct entry *writeback_sentinel(struct smq_policy *mq, unsigned level)
+{
+       return get_sentinel(&mq->writeback_sentinel_alloc, level, mq->current_writeback_sentinels);
+}
+
+static struct entry *demote_sentinel(struct smq_policy *mq, unsigned level)
+{
+       return get_sentinel(&mq->demote_sentinel_alloc, level, mq->current_demote_sentinels);
+}
+
+static void __update_writeback_sentinels(struct smq_policy *mq)
+{
+       unsigned level;
+       struct queue *q = &mq->dirty;
+       struct entry *sentinel;
+
+       for (level = 0; level < q->nr_levels; level++) {
+               sentinel = writeback_sentinel(mq, level);
+               q_del(q, sentinel);
+               q_push(q, sentinel);
+       }
+}
+
+static void __update_demote_sentinels(struct smq_policy *mq)
+{
+       unsigned level;
+       struct queue *q = &mq->clean;
+       struct entry *sentinel;
+
+       for (level = 0; level < q->nr_levels; level++) {
+               sentinel = demote_sentinel(mq, level);
+               q_del(q, sentinel);
+               q_push(q, sentinel);
+       }
+}
+
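+/*
+ * Every WRITEBACK_PERIOD/DEMOTE_PERIOD the current sentinel in each
+ * level is re-queued and the active sentinel set is flipped.  Entries
+ * are pushed just in front of the current sentinel (see push()), so a
+ * sentinel marks where one period ends and the next begins.
+ */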
+static void update_sentinels(struct smq_policy *mq)
+{
+       if (time_after(jiffies, mq->next_writeback_period)) {
+               __update_writeback_sentinels(mq);
+               mq->next_writeback_period = jiffies + WRITEBACK_PERIOD;
+               mq->current_writeback_sentinels = !mq->current_writeback_sentinels;
+       }
+
+       if (time_after(jiffies, mq->next_demote_period)) {
+               __update_demote_sentinels(mq);
+               mq->next_demote_period = jiffies + DEMOTE_PERIOD;
+               mq->current_demote_sentinels = !mq->current_demote_sentinels;
+       }
+}
+
+static void __sentinels_init(struct smq_policy *mq)
+{
+       unsigned level;
+       struct entry *sentinel;
+
+       for (level = 0; level < NR_CACHE_LEVELS; level++) {
+               sentinel = writeback_sentinel(mq, level);
+               sentinel->level = level;
+               q_push(&mq->dirty, sentinel);
+
+               sentinel = demote_sentinel(mq, level);
+               sentinel->level = level;
+               q_push(&mq->clean, sentinel);
+       }
+}
+
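+/*
+ * Push both sentinel sets onto the queues by calling __sentinels_init()
+ * twice with the 'current' flags flipped in between.
+ */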
+static void sentinels_init(struct smq_policy *mq)
+{
+       mq->next_writeback_period = jiffies + WRITEBACK_PERIOD;
+       mq->next_demote_period = jiffies + DEMOTE_PERIOD;
+
+       mq->current_writeback_sentinels = false;
+       mq->current_demote_sentinels = false;
+       __sentinels_init(mq);
+
+       mq->current_writeback_sentinels = !mq->current_writeback_sentinels;
+       mq->current_demote_sentinels = !mq->current_demote_sentinels;
+       __sentinels_init(mq);
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * These methods tie together the dirty queue, clean queue and hash table.
+ */
+static void push_new(struct smq_policy *mq, struct entry *e)
+{
+       struct queue *q = e->dirty ? &mq->dirty : &mq->clean;
+       h_insert(&mq->table, e);
+       q_push(q, e);
+}
+
+static void push(struct smq_policy *mq, struct entry *e)
+{
+       struct entry *sentinel;
+
+       h_insert(&mq->table, e);
+
+       /*
+        * Punch this into the queue just in front of the sentinel, to
+        * ensure it's cleaned straight away.
+        */
+       if (e->dirty) {
+               sentinel = writeback_sentinel(mq, e->level);
+               q_push_before(&mq->dirty, sentinel, e);
+       } else {
+               sentinel = demote_sentinel(mq, e->level);
+               q_push_before(&mq->clean, sentinel, e);
+       }
+}
+
+/*
+ * Removes an entry from its queue and from the hash table.
+ */
+static void __del(struct smq_policy *mq, struct queue *q, struct entry *e)
+{
+       q_del(q, e);
+       h_remove(&mq->table, e);
+}
+
+static void del(struct smq_policy *mq, struct entry *e)
+{
+       __del(mq, e->dirty ? &mq->dirty : &mq->clean, e);
+}
+
+static struct entry *pop_old(struct smq_policy *mq, struct queue *q, unsigned max_level)
+{
+       struct entry *e = q_pop_old(q, max_level);
+       if (e)
+               h_remove(&mq->table, e);
+       return e;
+}
+
+static dm_cblock_t infer_cblock(struct smq_policy *mq, struct entry *e)
+{
+       return to_cblock(get_index(&mq->cache_alloc, e));
+}
+
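+/*
+ * An entry is only requeued once per cache period; the hit bits are
+ * cleared again in end_cache_period().
+ */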
+static void requeue(struct smq_policy *mq, struct entry *e)
+{
+       struct entry *sentinel;
+
+       if (!test_and_set_bit(from_cblock(infer_cblock(mq, e)), mq->cache_hit_bits)) {
+               if (e->dirty) {
+                       sentinel = writeback_sentinel(mq, e->level);
+                       q_requeue_before(&mq->dirty, sentinel, e, 1u);
+               } else {
+                       sentinel = demote_sentinel(mq, e->level);
+                       q_requeue_before(&mq->clean, sentinel, e, 1u);
+               }
+       }
+}
+
+static unsigned default_promote_level(struct smq_policy *mq)
+{
+       /*
+        * The promote level depends on the current performance of the
+        * cache.
+        *
+        * If the cache is performing badly, then we can't afford
+        * to promote much without causing performance to drop below that
+        * of the origin device.
+        *
+        * If the cache is performing well, then we don't need to promote
+        * much.  If it isn't broken, don't fix it.
+        *
+        * If the cache is middling then we promote more.
+        *
+        * This scheme reminds me of a graph of entropy vs probability of a
+        * binary variable.
+        */
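+       /* Indexed by the cache hit ratio scaled to the range 0..16. */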
+       static unsigned table[] = {1, 1, 1, 2, 4, 6, 7, 8, 7, 6, 4, 4, 3, 3, 2, 2, 1};
+
+       unsigned hits = mq->cache_stats.hits;
+       unsigned misses = mq->cache_stats.misses;
+       unsigned index = safe_div(hits << 4u, hits + misses);
+       return table[index];
+}
+
+static void update_promote_levels(struct smq_policy *mq)
+{
+       /*
+        * If there are unused cache entries then we want to be really
+        * eager to promote.
+        */
+       unsigned threshold_level = allocator_empty(&mq->cache_alloc) ?
+               default_promote_level(mq) : (NR_HOTSPOT_LEVELS / 2u);
+
+       /*
+        * If the hotspot queue is performing badly then we have little
+        * confidence that we know which blocks to promote.  So we cut down
+        * the amount of promotions.
+        */
+       switch (stats_assess(&mq->hotspot_stats)) {
+       case Q_POOR:
+               threshold_level /= 4u;
+               break;
+
+       case Q_FAIR:
+               threshold_level /= 2u;
+               break;
+
+       case Q_WELL:
+               break;
+       }
+
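+       /*
+        * Writes have to reach a hotspot level two higher than reads
+        * before they're considered for promotion.
+        */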
+       mq->read_promote_level = NR_HOTSPOT_LEVELS - threshold_level;
+       mq->write_promote_level = (NR_HOTSPOT_LEVELS - threshold_level) + 2u;
+}
+
+/*
+ * If the hotspot queue is performing badly, then we try and move entries
+ * around more quickly.
+ */
+static void update_level_jump(struct smq_policy *mq)
+{
+       switch (stats_assess(&mq->hotspot_stats)) {
+       case Q_POOR:
+               mq->hotspot_level_jump = 4u;
+               break;
+
+       case Q_FAIR:
+               mq->hotspot_level_jump = 2u;
+               break;
+
+       case Q_WELL:
+               mq->hotspot_level_jump = 1u;
+               break;
+       }
+}
+
+static void end_hotspot_period(struct smq_policy *mq)
+{
+       clear_bitset(mq->hotspot_hit_bits, mq->nr_hotspot_blocks);
+       update_promote_levels(mq);
+
+       if (time_after(jiffies, mq->next_hotspot_period)) {
+               update_level_jump(mq);
+               q_redistribute(&mq->hotspot);
+               stats_reset(&mq->hotspot_stats);
+               mq->next_hotspot_period = jiffies + HOTSPOT_UPDATE_PERIOD;
+       }
+}
+
+static void end_cache_period(struct smq_policy *mq)
+{
+       if (time_after(jiffies, mq->next_cache_period)) {
+               clear_bitset(mq->cache_hit_bits, from_cblock(mq->cache_size));
+
+               q_redistribute(&mq->dirty);
+               q_redistribute(&mq->clean);
+               stats_reset(&mq->cache_stats);
+
+               mq->next_cache_period = jiffies + CACHE_UPDATE_PERIOD;
+       }
+}
+
+static int demote_cblock(struct smq_policy *mq,
+                        struct policy_locker *locker,
+                        dm_oblock_t *oblock)
+{
+       struct entry *demoted = q_peek(&mq->clean, mq->clean.nr_levels, false);
+       if (!demoted)
+               /*
+                * We could get a block from mq->dirty, but that
+                * would add extra latency to the triggering bio as it
+                * waits for the writeback.  Better to not promote this
+                * time and hope there's a clean block next time this block
+                * is hit.
+                */
+               return -ENOSPC;
+
+       if (locker->fn(locker, demoted->oblock))
+               /*
+                * We couldn't lock this block.
+                */
+               return -EBUSY;
+
+       del(mq, demoted);
+       *oblock = demoted->oblock;
+       free_entry(&mq->cache_alloc, demoted);
+
+       return 0;
+}
+
+enum promote_result {
+       PROMOTE_NOT,
+       PROMOTE_TEMPORARY,
+       PROMOTE_PERMANENT
+};
+
+/*
+ * Converts a boolean into a promote result.
+ */
+static enum promote_result maybe_promote(bool promote)
+{
+       return promote ? PROMOTE_PERMANENT : PROMOTE_NOT;
+}
+
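+/*
+ * The core target sets fast_promote when the block has been discarded
+ * or the bio overwrites the whole block.  Such writes are promoted
+ * temporarily whenever a free cache entry is available; otherwise
+ * promotion depends on the block's level in the hotspot queue.
+ */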
+static enum promote_result should_promote(struct smq_policy *mq, struct entry *hs_e, struct bio *bio,
+                                         bool fast_promote)
+{
+       if (bio_data_dir(bio) == WRITE) {
+               if (!allocator_empty(&mq->cache_alloc) && fast_promote)
+                       return PROMOTE_TEMPORARY;
+
+               else
+                       return maybe_promote(hs_e->level >= mq->write_promote_level);
+       } else
+               return maybe_promote(hs_e->level >= mq->read_promote_level);
+}
+
+static void insert_in_cache(struct smq_policy *mq, dm_oblock_t oblock,
+                           struct policy_locker *locker,
+                           struct policy_result *result, enum promote_result pr)
+{
+       int r;
+       struct entry *e;
+
+       if (allocator_empty(&mq->cache_alloc)) {
+               result->op = POLICY_REPLACE;
+               r = demote_cblock(mq, locker, &result->old_oblock);
+               if (r) {
+                       result->op = POLICY_MISS;
+                       return;
+               }
+
+       } else
+               result->op = POLICY_NEW;
+
+       e = alloc_entry(&mq->cache_alloc);
+       BUG_ON(!e);
+       e->oblock = oblock;
+
+       if (pr == PROMOTE_TEMPORARY)
+               push(mq, e);
+       else
+               push_new(mq, e);
+
+       result->cblock = infer_cblock(mq, e);
+}
+
+static dm_oblock_t to_hblock(struct smq_policy *mq, dm_oblock_t b)
+{
+       sector_t r = from_oblock(b);
+       (void) sector_div(r, mq->cache_blocks_per_hotspot_block);
+       return to_oblock(r);
+}
+
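+/*
+ * On a hotspot miss we either allocate a fresh hotspot entry or recycle
+ * one popped from the hotspot queue.
+ */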
+static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b, struct bio *bio)
+{
+       unsigned hi;
+       dm_oblock_t hb = to_hblock(mq, b);
+       struct entry *e = h_lookup(&mq->hotspot_table, hb);
+
+       if (e) {
+               stats_level_accessed(&mq->hotspot_stats, e->level);
+
+               hi = get_index(&mq->hotspot_alloc, e);
+               q_requeue(&mq->hotspot, e,
+                         test_and_set_bit(hi, mq->hotspot_hit_bits) ?
+                         0u : mq->hotspot_level_jump);
+
+       } else {
+               stats_miss(&mq->hotspot_stats);
+
+               e = alloc_entry(&mq->hotspot_alloc);
+               if (!e) {
+                       e = q_pop(&mq->hotspot);
+                       if (e) {
+                               h_remove(&mq->hotspot_table, e);
+                               hi = get_index(&mq->hotspot_alloc, e);
+                               clear_bit(hi, mq->hotspot_hit_bits);
+                       }
+
+               }
+
+               if (e) {
+                       e->oblock = hb;
+                       q_push(&mq->hotspot, e);
+                       h_insert(&mq->hotspot_table, e);
+               }
+       }
+
+       return e;
+}
+
+/*
+ * Looks the oblock up in the hash table, then decides whether it's a
+ * hit, a plain miss, or a miss that should trigger a promotion.
+ */
+static int map(struct smq_policy *mq, struct bio *bio, dm_oblock_t oblock,
+              bool can_migrate, bool fast_promote,
+              struct policy_locker *locker, struct policy_result *result)
+{
+       struct entry *e, *hs_e;
+       enum promote_result pr;
+
+       hs_e = update_hotspot_queue(mq, oblock, bio);
+
+       e = h_lookup(&mq->table, oblock);
+       if (e) {
+               stats_level_accessed(&mq->cache_stats, e->level);
+
+               requeue(mq, e);
+               result->op = POLICY_HIT;
+               result->cblock = infer_cblock(mq, e);
+
+       } else {
+               stats_miss(&mq->cache_stats);
+
+               pr = should_promote(mq, hs_e, bio, fast_promote);
+               if (pr == PROMOTE_NOT)
+                       result->op = POLICY_MISS;
+
+               else {
+                       if (!can_migrate) {
+                               result->op = POLICY_MISS;
+                               return -EWOULDBLOCK;
+                       }
+
+                       insert_in_cache(mq, oblock, locker, result, pr);
+               }
+       }
+
+       return 0;
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Public interface, via the policy struct.  See dm-cache-policy.h for a
+ * description of these.
+ */
+
+static struct smq_policy *to_smq_policy(struct dm_cache_policy *p)
+{
+       return container_of(p, struct smq_policy, policy);
+}
+
+static void smq_destroy(struct dm_cache_policy *p)
+{
+       struct smq_policy *mq = to_smq_policy(p);
+
+       h_exit(&mq->hotspot_table);
+       h_exit(&mq->table);
+       free_bitset(mq->hotspot_hit_bits);
+       free_bitset(mq->cache_hit_bits);
+       space_exit(&mq->es);
+       kfree(mq);
+}
+
+static void copy_tick(struct smq_policy *mq)
+{
+       unsigned long flags, tick;
+
+       spin_lock_irqsave(&mq->tick_lock, flags);
+       tick = mq->tick_protected;
+       if (tick != mq->tick) {
+               update_sentinels(mq);
+               end_hotspot_period(mq);
+               end_cache_period(mq);
+               mq->tick = tick;
+       }
+       spin_unlock_irqrestore(&mq->tick_lock, flags);
+}
+
+static bool maybe_lock(struct smq_policy *mq, bool can_block)
+{
+       if (can_block) {
+               mutex_lock(&mq->lock);
+               return true;
+       } else
+               return mutex_trylock(&mq->lock);
+}
+
+static int smq_map(struct dm_cache_policy *p, dm_oblock_t oblock,
+                  bool can_block, bool can_migrate, bool fast_promote,
+                  struct bio *bio, struct policy_locker *locker,
+                  struct policy_result *result)
+{
+       int r;
+       struct smq_policy *mq = to_smq_policy(p);
+
+       result->op = POLICY_MISS;
+
+       if (!maybe_lock(mq, can_block))
+               return -EWOULDBLOCK;
+
+       copy_tick(mq);
+       r = map(mq, bio, oblock, can_migrate, fast_promote, locker, result);
+       mutex_unlock(&mq->lock);
+
+       return r;
+}
+
+static int smq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock)
+{
+       int r;
+       struct smq_policy *mq = to_smq_policy(p);
+       struct entry *e;
+
+       if (!mutex_trylock(&mq->lock))
+               return -EWOULDBLOCK;
+
+       e = h_lookup(&mq->table, oblock);
+       if (e) {
+               *cblock = infer_cblock(mq, e);
+               r = 0;
+       } else
+               r = -ENOENT;
+
+       mutex_unlock(&mq->lock);
+
+       return r;
+}
+
+static void __smq_set_clear_dirty(struct smq_policy *mq, dm_oblock_t oblock, bool set)
+{
+       struct entry *e;
+
+       e = h_lookup(&mq->table, oblock);
+       BUG_ON(!e);
+
+       del(mq, e);
+       e->dirty = set;
+       push(mq, e);
+}
+
+static void smq_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
+{
+       struct smq_policy *mq = to_smq_policy(p);
+
+       mutex_lock(&mq->lock);
+       __smq_set_clear_dirty(mq, oblock, true);
+       mutex_unlock(&mq->lock);
+}
+
+static void smq_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
+{
+       struct smq_policy *mq = to_smq_policy(p);
+
+       mutex_lock(&mq->lock);
+       __smq_set_clear_dirty(mq, oblock, false);
+       mutex_unlock(&mq->lock);
+}
+
+static int smq_load_mapping(struct dm_cache_policy *p,
+                           dm_oblock_t oblock, dm_cblock_t cblock,
+                           uint32_t hint, bool hint_valid)
+{
+       struct smq_policy *mq = to_smq_policy(p);
+       struct entry *e;
+
+       e = alloc_particular_entry(&mq->cache_alloc, from_cblock(cblock));
+       e->oblock = oblock;
+       e->dirty = false;       /* this gets corrected in a minute */
+       e->level = hint_valid ? min(hint, NR_CACHE_LEVELS - 1) : 1;
+       push(mq, e);
+
+       return 0;
+}
+
+static int smq_save_hints(struct smq_policy *mq, struct queue *q,
+                         policy_walk_fn fn, void *context)
+{
+       int r;
+       unsigned level;
+       struct entry *e;
+
+       for (level = 0; level < q->nr_levels; level++)
+               for (e = l_head(q->es, q->qs + level); e; e = l_next(q->es, e)) {
+                       if (!e->sentinel) {
+                               r = fn(context, infer_cblock(mq, e),
+                                      e->oblock, e->level);
+                               if (r)
+                                       return r;
+                       }
+               }
+
+       return 0;
+}
+
+static int smq_walk_mappings(struct dm_cache_policy *p, policy_walk_fn fn,
+                            void *context)
+{
+       struct smq_policy *mq = to_smq_policy(p);
+       int r = 0;
+
+       mutex_lock(&mq->lock);
+
+       r = smq_save_hints(mq, &mq->clean, fn, context);
+       if (!r)
+               r = smq_save_hints(mq, &mq->dirty, fn, context);
+
+       mutex_unlock(&mq->lock);
+
+       return r;
+}
+
+static void __remove_mapping(struct smq_policy *mq, dm_oblock_t oblock)
+{
+       struct entry *e;
+
+       e = h_lookup(&mq->table, oblock);
+       BUG_ON(!e);
+
+       del(mq, e);
+       free_entry(&mq->cache_alloc, e);
+}
+
+static void smq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock)
+{
+       struct smq_policy *mq = to_smq_policy(p);
+
+       mutex_lock(&mq->lock);
+       __remove_mapping(mq, oblock);
+       mutex_unlock(&mq->lock);
+}
+
+static int __remove_cblock(struct smq_policy *mq, dm_cblock_t cblock)
+{
+       struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));
+
+       if (!e || !e->allocated)
+               return -ENODATA;
+
+       del(mq, e);
+       free_entry(&mq->cache_alloc, e);
+
+       return 0;
+}
+
+static int smq_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock)
+{
+       int r;
+       struct smq_policy *mq = to_smq_policy(p);
+
+       mutex_lock(&mq->lock);
+       r = __remove_cblock(mq, cblock);
+       mutex_unlock(&mq->lock);
+
+       return r;
+}
+
+
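+/*
+ * In 'critical' mode we only insist that at least CLEAN_TARGET_CRITICAL
+ * percent of the cache is clean; otherwise the target is an empty
+ * dirty queue.
+ */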
+#define CLEAN_TARGET_CRITICAL 5u /* percent */
+
+static bool clean_target_met(struct smq_policy *mq, bool critical)
+{
+       if (critical) {
+               /*
+                * Cache entries may not be populated, so we cannot rely
+                * on the size of the clean queue.
+                */
+               unsigned nr_clean = from_cblock(mq->cache_size) - q_size(&mq->dirty);
+               unsigned target = from_cblock(mq->cache_size) * CLEAN_TARGET_CRITICAL / 100u;
+
+               return nr_clean >= target;
+       } else
+               return !q_size(&mq->dirty);
+}
+
+static int __smq_writeback_work(struct smq_policy *mq, dm_oblock_t *oblock,
+                               dm_cblock_t *cblock, bool critical_only)
+{
+       struct entry *e = NULL;
+       bool target_met = clean_target_met(mq, critical_only);
+
+       if (critical_only)
+               /*
+                * Always try and keep the bottom level clean.
+                */
+               e = pop_old(mq, &mq->dirty, target_met ? 1u : mq->dirty.nr_levels);
+
+       else
+               e = pop_old(mq, &mq->dirty, mq->dirty.nr_levels);
+
+       if (!e)
+               return -ENODATA;
+
+       *oblock = e->oblock;
+       *cblock = infer_cblock(mq, e);
+       e->dirty = false;
+       push_new(mq, e);
+
+       return 0;
+}
+
+static int smq_writeback_work(struct dm_cache_policy *p, dm_oblock_t *oblock,
+                             dm_cblock_t *cblock, bool critical_only)
+{
+       int r;
+       struct smq_policy *mq = to_smq_policy(p);
+
+       mutex_lock(&mq->lock);
+       r = __smq_writeback_work(mq, oblock, cblock, critical_only);
+       mutex_unlock(&mq->lock);
+
+       return r;
+}
+
+static void __force_mapping(struct smq_policy *mq,
+                           dm_oblock_t current_oblock, dm_oblock_t new_oblock)
+{
+       struct entry *e = h_lookup(&mq->table, current_oblock);
+
+       if (e) {
+               del(mq, e);
+               e->oblock = new_oblock;
+               e->dirty = true;
+               push(mq, e);
+       }
+}
+
+static void smq_force_mapping(struct dm_cache_policy *p,
+                             dm_oblock_t current_oblock, dm_oblock_t new_oblock)
+{
+       struct smq_policy *mq = to_smq_policy(p);
+
+       mutex_lock(&mq->lock);
+       __force_mapping(mq, current_oblock, new_oblock);
+       mutex_unlock(&mq->lock);
+}
+
+static dm_cblock_t smq_residency(struct dm_cache_policy *p)
+{
+       dm_cblock_t r;
+       struct smq_policy *mq = to_smq_policy(p);
+
+       mutex_lock(&mq->lock);
+       r = to_cblock(mq->cache_alloc.nr_allocated);
+       mutex_unlock(&mq->lock);
+
+       return r;
+}
+
+static void smq_tick(struct dm_cache_policy *p, bool can_block)
+{
+       struct smq_policy *mq = to_smq_policy(p);
+       unsigned long flags;
+
+       spin_lock_irqsave(&mq->tick_lock, flags);
+       mq->tick_protected++;
+       spin_unlock_irqrestore(&mq->tick_lock, flags);
+
+       if (can_block) {
+               mutex_lock(&mq->lock);
+               copy_tick(mq);
+               mutex_unlock(&mq->lock);
+       }
+}
+
+/* Init the policy plugin interface function pointers. */
+static void init_policy_functions(struct smq_policy *mq)
+{
+       mq->policy.destroy = smq_destroy;
+       mq->policy.map = smq_map;
+       mq->policy.lookup = smq_lookup;
+       mq->policy.set_dirty = smq_set_dirty;
+       mq->policy.clear_dirty = smq_clear_dirty;
+       mq->policy.load_mapping = smq_load_mapping;
+       mq->policy.walk_mappings = smq_walk_mappings;
+       mq->policy.remove_mapping = smq_remove_mapping;
+       mq->policy.remove_cblock = smq_remove_cblock;
+       mq->policy.writeback_work = smq_writeback_work;
+       mq->policy.force_mapping = smq_force_mapping;
+       mq->policy.residency = smq_residency;
+       mq->policy.tick = smq_tick;
+}
+
+static bool too_many_hotspot_blocks(sector_t origin_size,
+                                   sector_t hotspot_block_size,
+                                   unsigned nr_hotspot_blocks)
+{
+       return (hotspot_block_size * nr_hotspot_blocks) > origin_size;
+}
+
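+/*
+ * Hotspot blocks start out at 16 times the cache block size and are
+ * halved (but never below the cache block size) until they no longer
+ * cover more than the origin device.
+ */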
+static void calc_hotspot_params(sector_t origin_size,
+                               sector_t cache_block_size,
+                               unsigned nr_cache_blocks,
+                               sector_t *hotspot_block_size,
+                               unsigned *nr_hotspot_blocks)
+{
+       *hotspot_block_size = cache_block_size * 16u;
+       *nr_hotspot_blocks = max(nr_cache_blocks / 4u, 1024u);
+
+       while ((*hotspot_block_size > cache_block_size) &&
+              too_many_hotspot_blocks(origin_size, *hotspot_block_size, *nr_hotspot_blocks))
+               *hotspot_block_size /= 2u;
+}
+
+static struct dm_cache_policy *smq_create(dm_cblock_t cache_size,
+                                         sector_t origin_size,
+                                         sector_t cache_block_size)
+{
+       unsigned i;
+       unsigned nr_sentinels_per_queue = 2u * NR_CACHE_LEVELS;
+       unsigned total_sentinels = 2u * nr_sentinels_per_queue;
+       struct smq_policy *mq = kzalloc(sizeof(*mq), GFP_KERNEL);
+
+       if (!mq)
+               return NULL;
+
+       init_policy_functions(mq);
+       mq->cache_size = cache_size;
+       mq->cache_block_size = cache_block_size;
+
+       calc_hotspot_params(origin_size, cache_block_size, from_cblock(cache_size),
+                           &mq->hotspot_block_size, &mq->nr_hotspot_blocks);
+
+       mq->cache_blocks_per_hotspot_block = div64_u64(mq->hotspot_block_size, mq->cache_block_size);
+       mq->hotspot_level_jump = 1u;
+       if (space_init(&mq->es, total_sentinels + mq->nr_hotspot_blocks + from_cblock(cache_size))) {
+               DMERR("couldn't initialize entry space");
+               goto bad_pool_init;
+       }
+
+       init_allocator(&mq->writeback_sentinel_alloc, &mq->es, 0, nr_sentinels_per_queue);
+       for (i = 0; i < nr_sentinels_per_queue; i++)
+               get_entry(&mq->writeback_sentinel_alloc, i)->sentinel = true;
+
+       init_allocator(&mq->demote_sentinel_alloc, &mq->es, nr_sentinels_per_queue, total_sentinels);
+       for (i = 0; i < nr_sentinels_per_queue; i++)
+               get_entry(&mq->demote_sentinel_alloc, i)->sentinel = true;
+
+       init_allocator(&mq->hotspot_alloc, &mq->es, total_sentinels,
+                      total_sentinels + mq->nr_hotspot_blocks);
+
+       init_allocator(&mq->cache_alloc, &mq->es,
+                      total_sentinels + mq->nr_hotspot_blocks,
+                      total_sentinels + mq->nr_hotspot_blocks + from_cblock(cache_size));
+
+       mq->hotspot_hit_bits = alloc_bitset(mq->nr_hotspot_blocks);
+       if (!mq->hotspot_hit_bits) {
+               DMERR("couldn't allocate hotspot hit bitset");
+               goto bad_hotspot_hit_bits;
+       }
+       clear_bitset(mq->hotspot_hit_bits, mq->nr_hotspot_blocks);
+
+       if (from_cblock(cache_size)) {
+               mq->cache_hit_bits = alloc_bitset(from_cblock(cache_size));
+               if (!mq->cache_hit_bits) {
+                       DMERR("couldn't allocate cache hit bitset");
+                       goto bad_cache_hit_bits;
+               }
+               clear_bitset(mq->cache_hit_bits, from_cblock(mq->cache_size));
+       } else
+               mq->cache_hit_bits = NULL;
+
+       mq->tick_protected = 0;
+       mq->tick = 0;
+       mutex_init(&mq->lock);
+       spin_lock_init(&mq->tick_lock);
+
+       q_init(&mq->hotspot, &mq->es, NR_HOTSPOT_LEVELS);
+       mq->hotspot.nr_top_levels = 8;
+       mq->hotspot.nr_in_top_levels = min(mq->nr_hotspot_blocks / NR_HOTSPOT_LEVELS,
+                                          from_cblock(mq->cache_size) / mq->cache_blocks_per_hotspot_block);
+
+       q_init(&mq->clean, &mq->es, NR_CACHE_LEVELS);
+       q_init(&mq->dirty, &mq->es, NR_CACHE_LEVELS);
+
+       stats_init(&mq->hotspot_stats, NR_HOTSPOT_LEVELS);
+       stats_init(&mq->cache_stats, NR_CACHE_LEVELS);
+
+       if (h_init(&mq->table, &mq->es, from_cblock(cache_size)))
+               goto bad_alloc_table;
+
+       if (h_init(&mq->hotspot_table, &mq->es, mq->nr_hotspot_blocks))
+               goto bad_alloc_hotspot_table;
+
+       sentinels_init(mq);
+       mq->write_promote_level = mq->read_promote_level = NR_HOTSPOT_LEVELS;
+
+       mq->next_hotspot_period = jiffies;
+       mq->next_cache_period = jiffies;
+
+       return &mq->policy;
+
+bad_alloc_hotspot_table:
+       h_exit(&mq->table);
+bad_alloc_table:
+       free_bitset(mq->cache_hit_bits);
+bad_cache_hit_bits:
+       free_bitset(mq->hotspot_hit_bits);
+bad_hotspot_hit_bits:
+       space_exit(&mq->es);
+bad_pool_init:
+       kfree(mq);
+
+       return NULL;
+}
+
+/*----------------------------------------------------------------*/
+
+static struct dm_cache_policy_type smq_policy_type = {
+       .name = "smq",
+       .version = {1, 0, 0},
+       .hint_size = 4,
+       .owner = THIS_MODULE,
+       .create = smq_create
+};
+
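+/* smq is also registered as the "default" policy. */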
+static struct dm_cache_policy_type default_policy_type = {
+       .name = "default",
+       .version = {1, 0, 0},
+       .hint_size = 4,
+       .owner = THIS_MODULE,
+       .create = smq_create,
+       .real = &smq_policy_type
+};
+
+static int __init smq_init(void)
+{
+       int r;
+
+       r = dm_cache_policy_register(&smq_policy_type);
+       if (r) {
+               DMERR("register failed %d", r);
+               return -ENOMEM;
+       }
+
+       r = dm_cache_policy_register(&default_policy_type);
+       if (r) {
+               DMERR("register failed (as default) %d", r);
+               dm_cache_policy_unregister(&smq_policy_type);
+               return -ENOMEM;
+       }
+
+       return 0;
+}
+
+static void __exit smq_exit(void)
+{
+       dm_cache_policy_unregister(&smq_policy_type);
+       dm_cache_policy_unregister(&default_policy_type);
+}
+
+module_init(smq_init);
+module_exit(smq_exit);
+
+MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("smq cache policy");
index f50fe36..05db56e 100644 (file)
@@ -70,6 +70,18 @@ enum policy_operation {
 };
 
 /*
+ * When issuing a POLICY_REPLACE the policy needs to make a callback to
+ * lock the block being demoted.  This doesn't need to occur during a
+ * writeback operation since the block remains in the cache.
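+ * The callback should return non-zero if the block could not be locked.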
+ */
+struct policy_locker;
+typedef int (*policy_lock_fn)(struct policy_locker *l, dm_oblock_t oblock);
+
+struct policy_locker {
+       policy_lock_fn fn;
+};
+
+/*
  * This is the instruction passed back to the core target.
  */
 struct policy_result {
@@ -122,7 +134,8 @@ struct dm_cache_policy {
         */
        int (*map)(struct dm_cache_policy *p, dm_oblock_t oblock,
                   bool can_block, bool can_migrate, bool discarded_oblock,
-                  struct bio *bio, struct policy_result *result);
+                  struct bio *bio, struct policy_locker *locker,
+                  struct policy_result *result);
 
        /*
         * Sometimes we want to see if a block is in the cache, without
@@ -165,7 +178,9 @@ struct dm_cache_policy {
        int (*remove_cblock)(struct dm_cache_policy *p, dm_cblock_t cblock);
 
        /*
-        * Provide a dirty block to be written back by the core target.
+        * Provide a dirty block to be written back by the core target.  If
+        * critical_only is set then the policy should only provide work if
+        * it urgently needs it.
         *
         * Returns:
         *
@@ -173,7 +188,8 @@ struct dm_cache_policy {
         *
         * -ENODATA: no dirty blocks available
         */
-       int (*writeback_work)(struct dm_cache_policy *p, dm_oblock_t *oblock, dm_cblock_t *cblock);
+       int (*writeback_work)(struct dm_cache_policy *p, dm_oblock_t *oblock, dm_cblock_t *cblock,
+                             bool critical_only);
 
        /*
         * How full is the cache?
@@ -184,16 +200,16 @@ struct dm_cache_policy {
         * Because of where we sit in the block layer, we can be asked to
         * map a lot of little bios that are all in the same block (no
         * queue merging has occurred).  To stop the policy being fooled by
-        * these the core target sends regular tick() calls to the policy.
+        * these, the core target sends regular tick() calls to the policy.
         * The policy should only count an entry as hit once per tick.
         */
-       void (*tick)(struct dm_cache_policy *p);
+       void (*tick)(struct dm_cache_policy *p, bool can_block);
 
        /*
         * Configuration.
         */
-       int (*emit_config_values)(struct dm_cache_policy *p,
-                                 char *result, unsigned maxlen);
+       int (*emit_config_values)(struct dm_cache_policy *p, char *result,
+                                 unsigned maxlen, ssize_t *sz_ptr);
        int (*set_config_value)(struct dm_cache_policy *p,
                                const char *key, const char *value);
 
index 41b2594..1b4e175 100644 (file)
@@ -25,44 +25,93 @@ DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
 
 /*----------------------------------------------------------------*/
 
-/*
- * Glossary:
- *
- * oblock: index of an origin block
- * cblock: index of a cache block
- * promotion: movement of a block from origin to cache
- * demotion: movement of a block from cache to origin
- * migration: movement of a block between the origin and cache device,
- *           either direction
- */
+#define IOT_RESOLUTION 4
 
-/*----------------------------------------------------------------*/
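+/*
+ * Tracks sectors of in-flight IO so we can tell how long a device has
+ * been idle; the cache keeps one of these for its origin device.
+ */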
+struct io_tracker {
+       spinlock_t lock;
 
-static size_t bitset_size_in_bytes(unsigned nr_entries)
+       /*
+        * Sectors of in-flight IO.
+        */
+       sector_t in_flight;
+
+       /*
+        * The time, in jiffies, when this device became idle (if it is
+        * indeed idle).
+        */
+       unsigned long idle_time;
+       unsigned long last_update_time;
+};
+
+static void iot_init(struct io_tracker *iot)
 {
-       return sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG);
+       spin_lock_init(&iot->lock);
+       iot->in_flight = 0ul;
+       iot->idle_time = 0ul;
+       iot->last_update_time = jiffies;
 }
 
-static unsigned long *alloc_bitset(unsigned nr_entries)
+static bool __iot_idle_for(struct io_tracker *iot, unsigned long jifs)
 {
-       size_t s = bitset_size_in_bytes(nr_entries);
-       return vzalloc(s);
+       if (iot->in_flight)
+               return false;
+
+       return time_after(jiffies, iot->idle_time + jifs);
 }
 
-static void clear_bitset(void *bitset, unsigned nr_entries)
+static bool iot_idle_for(struct io_tracker *iot, unsigned long jifs)
 {
-       size_t s = bitset_size_in_bytes(nr_entries);
-       memset(bitset, 0, s);
+       bool r;
+       unsigned long flags;
+
+       spin_lock_irqsave(&iot->lock, flags);
+       r = __iot_idle_for(iot, jifs);
+       spin_unlock_irqrestore(&iot->lock, flags);
+
+       return r;
 }
 
-static void free_bitset(unsigned long *bits)
+static void iot_io_begin(struct io_tracker *iot, sector_t len)
 {
-       vfree(bits);
+       unsigned long flags;
+
+       spin_lock_irqsave(&iot->lock, flags);
+       iot->in_flight += len;
+       spin_unlock_irqrestore(&iot->lock, flags);
+}
+
+static void __iot_io_end(struct io_tracker *iot, sector_t len)
+{
+       iot->in_flight -= len;
+       if (!iot->in_flight)
+               iot->idle_time = jiffies;
+}
+
+static void iot_io_end(struct io_tracker *iot, sector_t len)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&iot->lock, flags);
+       __iot_io_end(iot, len);
+       spin_unlock_irqrestore(&iot->lock, flags);
 }
 
 /*----------------------------------------------------------------*/
 
 /*
+ * Glossary:
+ *
+ * oblock: index of an origin block
+ * cblock: index of a cache block
+ * promotion: movement of a block from origin to cache
+ * demotion: movement of a block from cache to origin
+ * migration: movement of a block between the origin and cache device,
+ *           either direction
+ */
+
+/*----------------------------------------------------------------*/
+
+/*
  * There are a couple of places where we let a bio run, but want to do some
  * work before calling its endio function.  We do this by temporarily
  * changing the endio fn.
@@ -101,12 +150,10 @@ static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio)
 #define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
 #define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
 
-/*
- * FIXME: the cache is read/write for the time being.
- */
 enum cache_metadata_mode {
        CM_WRITE,               /* metadata may be changed */
        CM_READ_ONLY,           /* metadata may not be changed */
+       CM_FAIL
 };
 
 enum cache_io_mode {
@@ -208,6 +255,7 @@ struct cache {
        int sectors_per_block_shift;
 
        spinlock_t lock;
+       struct list_head deferred_cells;
        struct bio_list deferred_bios;
        struct bio_list deferred_flush_bios;
        struct bio_list deferred_writethrough_bios;
@@ -282,6 +330,8 @@ struct cache {
         */
        spinlock_t invalidation_lock;
        struct list_head invalidation_requests;
+
+       struct io_tracker origin_tracker;
 };
 
 struct per_bio_data {
@@ -289,6 +339,7 @@ struct per_bio_data {
        unsigned req_nr:2;
        struct dm_deferred_entry *all_io_entry;
        struct dm_hook_info hook_info;
+       sector_t len;
 
        /*
         * writethrough fields.  These MUST remain at the end of this
@@ -332,6 +383,8 @@ struct prealloc {
        struct dm_bio_prison_cell *cell2;
 };
 
+static enum cache_metadata_mode get_cache_mode(struct cache *cache);
+
 static void wake_worker(struct cache *cache)
 {
        queue_work(cache->wq, &cache->worker);
@@ -365,10 +418,13 @@ static struct dm_cache_migration *alloc_migration(struct cache *cache)
 
 static void free_migration(struct dm_cache_migration *mg)
 {
-       if (atomic_dec_and_test(&mg->cache->nr_allocated_migrations))
-               wake_up(&mg->cache->migration_wait);
+       struct cache *cache = mg->cache;
+
+       if (atomic_dec_and_test(&cache->nr_allocated_migrations))
+               wake_up(&cache->migration_wait);
 
-       mempool_free(mg, mg->cache->migration_pool);
+       mempool_free(mg, cache->migration_pool);
+       wake_worker(cache);
 }
 
 static int prealloc_data_structs(struct cache *cache, struct prealloc *p)
@@ -643,6 +699,9 @@ static void save_stats(struct cache *cache)
 {
        struct dm_cache_statistics stats;
 
+       if (get_cache_mode(cache) >= CM_READ_ONLY)
+               return;
+
        stats.read_hits = atomic_read(&cache->stats.read_hit);
        stats.read_misses = atomic_read(&cache->stats.read_miss);
        stats.write_hits = atomic_read(&cache->stats.write_hit);
@@ -695,6 +754,7 @@ static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size)
        pb->tick = false;
        pb->req_nr = dm_bio_get_target_bio_nr(bio);
        pb->all_io_entry = NULL;
+       pb->len = 0;
 
        return pb;
 }
@@ -792,12 +852,43 @@ static void inc_ds(struct cache *cache, struct bio *bio,
        pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
 }
 
+static bool accountable_bio(struct cache *cache, struct bio *bio)
+{
+       return ((bio->bi_bdev == cache->origin_dev->bdev) &&
+               !(bio->bi_rw & REQ_DISCARD));
+}
+
+static void accounted_begin(struct cache *cache, struct bio *bio)
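+/*
+ * The accounted length is stashed in the per-bio data so the same
+ * number of sectors can be subtracted when the bio completes.
+ */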
+{
+       size_t pb_data_size = get_per_bio_data_size(cache);
+       struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
+
+       if (accountable_bio(cache, bio)) {
+               pb->len = bio_sectors(bio);
+               iot_io_begin(&cache->origin_tracker, pb->len);
+       }
+}
+
+static void accounted_complete(struct cache *cache, struct bio *bio)
+{
+       size_t pb_data_size = get_per_bio_data_size(cache);
+       struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
+
+       iot_io_end(&cache->origin_tracker, pb->len);
+}
+
+static void accounted_request(struct cache *cache, struct bio *bio)
+{
+       accounted_begin(cache, bio);
+       generic_make_request(bio);
+}
+
 static void issue(struct cache *cache, struct bio *bio)
 {
        unsigned long flags;
 
        if (!bio_triggers_commit(cache, bio)) {
-               generic_make_request(bio);
+               accounted_request(cache, bio);
                return;
        }
 
@@ -870,6 +961,94 @@ static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio,
 }
 
 /*----------------------------------------------------------------
+ * Failure modes
+ *--------------------------------------------------------------*/
+static enum cache_metadata_mode get_cache_mode(struct cache *cache)
+{
+       return cache->features.mode;
+}
+
+static const char *cache_device_name(struct cache *cache)
+{
+       return dm_device_name(dm_table_get_md(cache->ti->table));
+}
+
+static void notify_mode_switch(struct cache *cache, enum cache_metadata_mode mode)
+{
+       const char *descs[] = {
+               "write",
+               "read-only",
+               "fail"
+       };
+
+       dm_table_event(cache->ti->table);
+       DMINFO("%s: switching cache to %s mode",
+              cache_device_name(cache), descs[(int)mode]);
+}
+
+static void set_cache_mode(struct cache *cache, enum cache_metadata_mode new_mode)
+{
+       bool needs_check = dm_cache_metadata_needs_check(cache->cmd);
+       enum cache_metadata_mode old_mode = get_cache_mode(cache);
+
+       if (new_mode == CM_WRITE && needs_check) {
+               DMERR("%s: unable to switch cache to write mode until repaired.",
+                     cache_device_name(cache));
+               if (old_mode != new_mode)
+                       new_mode = old_mode;
+               else
+                       new_mode = CM_READ_ONLY;
+       }
+
+       /* Never move out of fail mode */
+       if (old_mode == CM_FAIL)
+               new_mode = CM_FAIL;
+
+       switch (new_mode) {
+       case CM_FAIL:
+       case CM_READ_ONLY:
+               dm_cache_metadata_set_read_only(cache->cmd);
+               break;
+
+       case CM_WRITE:
+               dm_cache_metadata_set_read_write(cache->cmd);
+               break;
+       }
+
+       cache->features.mode = new_mode;
+
+       if (new_mode != old_mode)
+               notify_mode_switch(cache, new_mode);
+}
+
+static void abort_transaction(struct cache *cache)
+{
+       const char *dev_name = cache_device_name(cache);
+
+       if (get_cache_mode(cache) >= CM_READ_ONLY)
+               return;
+
+       if (dm_cache_metadata_set_needs_check(cache->cmd)) {
+               DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name);
+               set_cache_mode(cache, CM_FAIL);
+       }
+
+       DMERR_LIMIT("%s: aborting current metadata transaction", dev_name);
+       if (dm_cache_metadata_abort(cache->cmd)) {
+               DMERR("%s: failed to abort metadata transaction", dev_name);
+               set_cache_mode(cache, CM_FAIL);
+       }
+}
+
+static void metadata_operation_failed(struct cache *cache, const char *op, int r)
+{
+       DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
+                   cache_device_name(cache), op, r);
+       abort_transaction(cache);
+       set_cache_mode(cache, CM_READ_ONLY);
+}
+
+/*----------------------------------------------------------------
  * Migration processing
  *
  * Migration covers moving data from the origin device to the cache, or
@@ -885,26 +1064,63 @@ static void dec_io_migrations(struct cache *cache)
        atomic_dec(&cache->nr_io_migrations);
 }
 
-static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
-                        bool holder)
+static void __cell_release(struct cache *cache, struct dm_bio_prison_cell *cell,
+                          bool holder, struct bio_list *bios)
 {
        (holder ? dm_cell_release : dm_cell_release_no_holder)
-               (cache->prison, cell, &cache->deferred_bios);
+               (cache->prison, cell, bios);
        free_prison_cell(cache, cell);
 }
 
-static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
-                      bool holder)
+static bool discard_or_flush(struct bio *bio)
+{
+       return bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD);
+}
+
+static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell)
+{
+       if (discard_or_flush(cell->holder))
+               /*
+                * We have to handle these bios
+                * individually.
+                */
+               __cell_release(cache, cell, true, &cache->deferred_bios);
+
+       else
+               list_add_tail(&cell->user_list, &cache->deferred_cells);
+}
+
+static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell, bool holder)
 {
        unsigned long flags;
 
+       if (!holder && dm_cell_promote_or_release(cache->prison, cell)) {
+               /*
+                * There was no prisoner to promote to holder, the
+                * cell has been released.
+                */
+               free_prison_cell(cache, cell);
+               return;
+       }
+
        spin_lock_irqsave(&cache->lock, flags);
-       __cell_defer(cache, cell, holder);
+       __cell_defer(cache, cell);
        spin_unlock_irqrestore(&cache->lock, flags);
 
        wake_worker(cache);
 }
 
+static void cell_error_with_code(struct cache *cache, struct dm_bio_prison_cell *cell, int err)
+{
+       dm_cell_error(cache->prison, cell, err);
+       dm_bio_prison_free_cell(cache->prison, cell);
+}
+
+static void cell_requeue(struct cache *cache, struct dm_bio_prison_cell *cell)
+{
+       cell_error_with_code(cache, cell, DM_ENDIO_REQUEUE);
+}
+
 static void free_io_migration(struct dm_cache_migration *mg)
 {
        dec_io_migrations(mg->cache);
@@ -914,21 +1130,22 @@ static void free_io_migration(struct dm_cache_migration *mg)
 static void migration_failure(struct dm_cache_migration *mg)
 {
        struct cache *cache = mg->cache;
+       const char *dev_name = cache_device_name(cache);
 
        if (mg->writeback) {
-               DMWARN_LIMIT("writeback failed; couldn't copy block");
+               DMERR_LIMIT("%s: writeback failed; couldn't copy block", dev_name);
                set_dirty(cache, mg->old_oblock, mg->cblock);
                cell_defer(cache, mg->old_ocell, false);
 
        } else if (mg->demote) {
-               DMWARN_LIMIT("demotion failed; couldn't copy block");
+               DMERR_LIMIT("%s: demotion failed; couldn't copy block", dev_name);
                policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock);
 
                cell_defer(cache, mg->old_ocell, mg->promote ? false : true);
                if (mg->promote)
                        cell_defer(cache, mg->new_ocell, true);
        } else {
-               DMWARN_LIMIT("promotion failed; couldn't copy block");
+               DMERR_LIMIT("%s: promotion failed; couldn't copy block", dev_name);
                policy_remove_mapping(cache->policy, mg->new_oblock);
                cell_defer(cache, mg->new_ocell, true);
        }
@@ -938,6 +1155,7 @@ static void migration_failure(struct dm_cache_migration *mg)
 
 static void migration_success_pre_commit(struct dm_cache_migration *mg)
 {
+       int r;
        unsigned long flags;
        struct cache *cache = mg->cache;
 
@@ -948,8 +1166,11 @@ static void migration_success_pre_commit(struct dm_cache_migration *mg)
                return;
 
        } else if (mg->demote) {
-               if (dm_cache_remove_mapping(cache->cmd, mg->cblock)) {
-                       DMWARN_LIMIT("demotion failed; couldn't update on disk metadata");
+               r = dm_cache_remove_mapping(cache->cmd, mg->cblock);
+               if (r) {
+                       DMERR_LIMIT("%s: demotion failed; couldn't update on disk metadata",
+                                   cache_device_name(cache));
+                       metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
                        policy_force_mapping(cache->policy, mg->new_oblock,
                                             mg->old_oblock);
                        if (mg->promote)
@@ -958,8 +1179,11 @@ static void migration_success_pre_commit(struct dm_cache_migration *mg)
                        return;
                }
        } else {
-               if (dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock)) {
-                       DMWARN_LIMIT("promotion failed; couldn't update on disk metadata");
+               r = dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock);
+               if (r) {
+                       DMERR_LIMIT("%s: promotion failed; couldn't update on disk metadata",
+                                   cache_device_name(cache));
+                       metadata_operation_failed(cache, "dm_cache_insert_mapping", r);
                        policy_remove_mapping(cache->policy, mg->new_oblock);
                        free_io_migration(mg);
                        return;
@@ -978,7 +1202,8 @@ static void migration_success_post_commit(struct dm_cache_migration *mg)
        struct cache *cache = mg->cache;
 
        if (mg->writeback) {
-               DMWARN("writeback unexpectedly triggered commit");
+               DMWARN_LIMIT("%s: writeback unexpectedly triggered commit",
+                            cache_device_name(cache));
                return;
 
        } else if (mg->demote) {
@@ -1054,7 +1279,7 @@ static void issue_copy(struct dm_cache_migration *mg)
        }
 
        if (r < 0) {
-               DMERR_LIMIT("issuing migration failed");
+               DMERR_LIMIT("%s: issuing migration failed", cache_device_name(cache));
                migration_failure(mg);
        }
 }
@@ -1093,7 +1318,7 @@ static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio)
         * No need to inc_ds() here, since the cell will be held for the
         * duration of the io.
         */
-       generic_make_request(bio);
+       accounted_request(mg->cache, bio);
 }
 
 static bool bio_writes_complete_block(struct cache *cache, struct bio *bio)
@@ -1439,32 +1664,154 @@ static void inc_miss_counter(struct cache *cache, struct bio *bio)
                   &cache->stats.read_miss : &cache->stats.write_miss);
 }
 
-static void process_bio(struct cache *cache, struct prealloc *structs,
-                       struct bio *bio)
+/*----------------------------------------------------------------*/
+
+struct inc_detail {
+       struct cache *cache;
+       struct bio_list bios_for_issue;
+       struct bio_list unhandled_bios;
+       bool any_writes;
+};
+
+static void inc_fn(void *context, struct dm_bio_prison_cell *cell)
+{
+       struct bio *bio;
+       struct inc_detail *detail = context;
+       struct cache *cache = detail->cache;
+
+       inc_ds(cache, cell->holder, cell);
+       if (bio_data_dir(cell->holder) == WRITE)
+               detail->any_writes = true;
+
+       while ((bio = bio_list_pop(&cell->bios))) {
+               if (discard_or_flush(bio)) {
+                       bio_list_add(&detail->unhandled_bios, bio);
+                       continue;
+               }
+
+               if (bio_data_dir(bio) == WRITE)
+                       detail->any_writes = true;
+
+               bio_list_add(&detail->bios_for_issue, bio);
+               inc_ds(cache, bio, cell);
+       }
+}
+
+// FIXME: refactor these two
+static void remap_cell_to_origin_clear_discard(struct cache *cache,
+                                              struct dm_bio_prison_cell *cell,
+                                              dm_oblock_t oblock, bool issue_holder)
+{
+       struct bio *bio;
+       unsigned long flags;
+       struct inc_detail detail;
+
+       detail.cache = cache;
+       bio_list_init(&detail.bios_for_issue);
+       bio_list_init(&detail.unhandled_bios);
+       detail.any_writes = false;
+
+       spin_lock_irqsave(&cache->lock, flags);
+       dm_cell_visit_release(cache->prison, inc_fn, &detail, cell);
+       bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios);
+       spin_unlock_irqrestore(&cache->lock, flags);
+
+       remap_to_origin(cache, cell->holder);
+       if (issue_holder)
+               issue(cache, cell->holder);
+       else
+               accounted_begin(cache, cell->holder);
+
+       if (detail.any_writes)
+               clear_discard(cache, oblock_to_dblock(cache, oblock));
+
+       while ((bio = bio_list_pop(&detail.bios_for_issue))) {
+               remap_to_origin(cache, bio);
+               issue(cache, bio);
+       }
+}
+
+static void remap_cell_to_cache_dirty(struct cache *cache, struct dm_bio_prison_cell *cell,
+                                     dm_oblock_t oblock, dm_cblock_t cblock, bool issue_holder)
+{
+       struct bio *bio;
+       unsigned long flags;
+       struct inc_detail detail;
+
+       detail.cache = cache;
+       bio_list_init(&detail.bios_for_issue);
+       bio_list_init(&detail.unhandled_bios);
+       detail.any_writes = false;
+
+       spin_lock_irqsave(&cache->lock, flags);
+       dm_cell_visit_release(cache->prison, inc_fn, &detail, cell);
+       bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios);
+       spin_unlock_irqrestore(&cache->lock, flags);
+
+       remap_to_cache(cache, cell->holder, cblock);
+       if (issue_holder)
+               issue(cache, cell->holder);
+       else
+               accounted_begin(cache, cell->holder);
+
+       if (detail.any_writes) {
+               set_dirty(cache, oblock, cblock);
+               clear_discard(cache, oblock_to_dblock(cache, oblock));
+       }
+
+       while ((bio = bio_list_pop(&detail.bios_for_issue))) {
+               remap_to_cache(cache, bio, cblock);
+               issue(cache, bio);
+       }
+}
+
+/*----------------------------------------------------------------*/
+
+struct old_oblock_lock {
+       struct policy_locker locker;
+       struct cache *cache;
+       struct prealloc *structs;
+       struct dm_bio_prison_cell *cell;
+};
+
+static int null_locker(struct policy_locker *locker, dm_oblock_t b)
+{
+       /* This should never be called */
+       BUG();
+       return 0;
+}
+
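+/*
+ * Called back by the policy during a lookup when it wants to detain the
+ * old oblock it is about to replace; the cell is taken from the caller's
+ * preallocated structs.
+ */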
+static int cell_locker(struct policy_locker *locker, dm_oblock_t b)
+{
+       struct old_oblock_lock *l = container_of(locker, struct old_oblock_lock, locker);
+       struct dm_bio_prison_cell *cell_prealloc = prealloc_get_cell(l->structs);
+
+       return bio_detain(l->cache, b, NULL, cell_prealloc,
+                         (cell_free_fn) prealloc_put_cell,
+                         l->structs, &l->cell);
+}
+
+static void process_cell(struct cache *cache, struct prealloc *structs,
+                        struct dm_bio_prison_cell *new_ocell)
 {
        int r;
        bool release_cell = true;
+       struct bio *bio = new_ocell->holder;
        dm_oblock_t block = get_bio_block(cache, bio);
-       struct dm_bio_prison_cell *cell_prealloc, *old_ocell, *new_ocell;
        struct policy_result lookup_result;
        bool passthrough = passthrough_mode(&cache->features);
-       bool discarded_block, can_migrate;
-
-       /*
-        * Check to see if that block is currently migrating.
-        */
-       cell_prealloc = prealloc_get_cell(structs);
-       r = bio_detain(cache, block, bio, cell_prealloc,
-                      (cell_free_fn) prealloc_put_cell,
-                      structs, &new_ocell);
-       if (r > 0)
-               return;
+       bool fast_promotion, can_migrate;
+       struct old_oblock_lock ool;
 
-       discarded_block = is_discarded_oblock(cache, block);
-       can_migrate = !passthrough && (discarded_block || spare_migration_bandwidth(cache));
+       fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio);
+       can_migrate = !passthrough && (fast_promotion || spare_migration_bandwidth(cache));
 
-       r = policy_map(cache->policy, block, true, can_migrate, discarded_block,
-                      bio, &lookup_result);
+       ool.locker.fn = cell_locker;
+       ool.cache = cache;
+       ool.structs = structs;
+       ool.cell = NULL;
+       r = policy_map(cache->policy, block, true, can_migrate, fast_promotion,
+                      bio, &ool.locker, &lookup_result);
 
        if (r == -EWOULDBLOCK)
                /* migration has been denied */
@@ -1500,9 +1847,9 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
                                remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
                                inc_and_issue(cache, bio, new_ocell);
 
-                       } else  {
-                               remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
-                               inc_and_issue(cache, bio, new_ocell);
+                       } else {
+                               remap_cell_to_cache_dirty(cache, new_ocell, block, lookup_result.cblock, true);
+                               release_cell = false;
                        }
                }
 
@@ -1510,8 +1857,8 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
 
        case POLICY_MISS:
                inc_miss_counter(cache, bio);
-               remap_to_origin_clear_discard(cache, bio, block);
-               inc_and_issue(cache, bio, new_ocell);
+               remap_cell_to_origin_clear_discard(cache, new_ocell, block, true);
+               release_cell = false;
                break;
 
        case POLICY_NEW:
@@ -1521,32 +1868,17 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
                break;
 
        case POLICY_REPLACE:
-               cell_prealloc = prealloc_get_cell(structs);
-               r = bio_detain(cache, lookup_result.old_oblock, bio, cell_prealloc,
-                              (cell_free_fn) prealloc_put_cell,
-                              structs, &old_ocell);
-               if (r > 0) {
-                       /*
-                        * We have to be careful to avoid lock inversion of
-                        * the cells.  So we back off, and wait for the
-                        * old_ocell to become free.
-                        */
-                       policy_force_mapping(cache->policy, block,
-                                            lookup_result.old_oblock);
-                       atomic_inc(&cache->stats.cache_cell_clash);
-                       break;
-               }
                atomic_inc(&cache->stats.demotion);
                atomic_inc(&cache->stats.promotion);
-
                demote_then_promote(cache, structs, lookup_result.old_oblock,
                                    block, lookup_result.cblock,
-                                   old_ocell, new_ocell);
+                                   ool.cell, new_ocell);
                release_cell = false;
                break;
 
        default:
-               DMERR_LIMIT("%s: erroring bio, unknown policy op: %u", __func__,
+               DMERR_LIMIT("%s: %s: erroring bio, unknown policy op: %u",
+                           cache_device_name(cache), __func__,
                            (unsigned) lookup_result.op);
                bio_io_error(bio);
        }
@@ -1555,10 +1887,48 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
                cell_defer(cache, new_ocell, false);
 }
 
+static void process_bio(struct cache *cache, struct prealloc *structs,
+                       struct bio *bio)
+{
+       int r;
+       dm_oblock_t block = get_bio_block(cache, bio);
+       struct dm_bio_prison_cell *cell_prealloc, *new_ocell;
+
+       /*
+        * Check to see if that block is currently migrating.
+        */
+       cell_prealloc = prealloc_get_cell(structs);
+       r = bio_detain(cache, block, bio, cell_prealloc,
+                      (cell_free_fn) prealloc_put_cell,
+                      structs, &new_ocell);
+       if (r > 0)
+               return;
+
+       process_cell(cache, structs, new_ocell);
+}
+
 static int need_commit_due_to_time(struct cache *cache)
 {
-       return !time_in_range(jiffies, cache->last_commit_jiffies,
-                             cache->last_commit_jiffies + COMMIT_PERIOD);
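+       /* Commit if COMMIT_PERIOD has elapsed, or if jiffies has wrapped. */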
+       return jiffies < cache->last_commit_jiffies ||
+              jiffies > cache->last_commit_jiffies + COMMIT_PERIOD;
+}
+
+/*
+ * A non-zero return indicates read_only or fail_io mode.
+ */
+static int commit(struct cache *cache, bool clean_shutdown)
+{
+       int r;
+
+       if (get_cache_mode(cache) >= CM_READ_ONLY)
+               return -EINVAL;
+
+       atomic_inc(&cache->stats.commit_count);
+       r = dm_cache_commit(cache->cmd, clean_shutdown);
+       if (r)
+               metadata_operation_failed(cache, "dm_cache_commit", r);
+
+       return r;
 }
 
 static int commit_if_needed(struct cache *cache)
@@ -1567,9 +1937,8 @@ static int commit_if_needed(struct cache *cache)
 
        if ((cache->commit_requested || need_commit_due_to_time(cache)) &&
            dm_cache_changed_this_transaction(cache->cmd)) {
-               atomic_inc(&cache->stats.commit_count);
+               r = commit(cache, false);
                cache->commit_requested = false;
-               r = dm_cache_commit(cache->cmd, false);
                cache->last_commit_jiffies = jiffies;
        }
 
@@ -1617,6 +1986,40 @@ static void process_deferred_bios(struct cache *cache)
        prealloc_free_structs(cache, &structs);
 }
 
+static void process_deferred_cells(struct cache *cache)
+{
+       unsigned long flags;
+       struct dm_bio_prison_cell *cell, *tmp;
+       struct list_head cells;
+       struct prealloc structs;
+
+       memset(&structs, 0, sizeof(structs));
+
+       INIT_LIST_HEAD(&cells);
+
+       spin_lock_irqsave(&cache->lock, flags);
+       list_splice_init(&cache->deferred_cells, &cells);
+       spin_unlock_irqrestore(&cache->lock, flags);
+
+       list_for_each_entry_safe(cell, tmp, &cells, user_list) {
+               /*
+                * If we've got no free migration structs, and processing
+                * this bio might require one, we pause until there are some
+                * prepared mappings to process.
+                */
+               if (prealloc_data_structs(cache, &structs)) {
+                       spin_lock_irqsave(&cache->lock, flags);
+                       list_splice(&cells, &cache->deferred_cells);
+                       spin_unlock_irqrestore(&cache->lock, flags);
+                       break;
+               }
+
+               process_cell(cache, &structs, cell);
+       }
+
+       prealloc_free_structs(cache, &structs);
+}
+
 static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
 {
        unsigned long flags;
@@ -1634,7 +2037,7 @@ static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
         * These bios have already been through inc_ds()
         */
        while ((bio = bio_list_pop(&bios)))
-               submit_bios ? generic_make_request(bio) : bio_io_error(bio);
+               submit_bios ? accounted_request(cache, bio) : bio_io_error(bio);
 }
 
 static void process_deferred_writethrough_bios(struct cache *cache)
@@ -1654,7 +2057,7 @@ static void process_deferred_writethrough_bios(struct cache *cache)
         * These bios have already been through inc_ds()
         */
        while ((bio = bio_list_pop(&bios)))
-               generic_make_request(bio);
+               accounted_request(cache, bio);
 }
 
 static void writeback_some_dirty_blocks(struct cache *cache)
@@ -1664,6 +2067,7 @@ static void writeback_some_dirty_blocks(struct cache *cache)
        dm_cblock_t cblock;
        struct prealloc structs;
        struct dm_bio_prison_cell *old_ocell;
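+       /* Origin is busy unless it has been idle for at least HZ jiffies. */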
+       bool busy = !iot_idle_for(&cache->origin_tracker, HZ);
 
        memset(&structs, 0, sizeof(structs));
 
@@ -1671,7 +2075,7 @@ static void writeback_some_dirty_blocks(struct cache *cache)
                if (prealloc_data_structs(cache, &structs))
                        break;
 
-               r = policy_writeback_work(cache->policy, &oblock, &cblock);
+               r = policy_writeback_work(cache->policy, &oblock, &cblock, busy);
                if (r)
                        break;
 
@@ -1702,15 +2106,17 @@ static void process_invalidation_request(struct cache *cache, struct invalidatio
                r = policy_remove_cblock(cache->policy, to_cblock(begin));
                if (!r) {
                        r = dm_cache_remove_mapping(cache->cmd, to_cblock(begin));
-                       if (r)
+                       if (r) {
+                               metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
                                break;
+                       }
 
                } else if (r == -ENODATA) {
                        /* harmless, already unmapped */
                        r = 0;
 
                } else {
-                       DMERR("policy_remove_cblock failed");
+                       DMERR("%s: policy_remove_cblock failed", cache_device_name(cache));
                        break;
                }
 
@@ -1783,7 +2189,22 @@ static void stop_worker(struct cache *cache)
        flush_workqueue(cache->wq);
 }
 
-static void requeue_deferred_io(struct cache *cache)
+static void requeue_deferred_cells(struct cache *cache)
+{
+       unsigned long flags;
+       struct list_head cells;
+       struct dm_bio_prison_cell *cell, *tmp;
+
+       INIT_LIST_HEAD(&cells);
+       spin_lock_irqsave(&cache->lock, flags);
+       list_splice_init(&cache->deferred_cells, &cells);
+       spin_unlock_irqrestore(&cache->lock, flags);
+
+       list_for_each_entry_safe(cell, tmp, &cells, user_list)
+               cell_requeue(cache, cell);
+}
+
+static void requeue_deferred_bios(struct cache *cache)
 {
        struct bio *bio;
        struct bio_list bios;
@@ -1804,6 +2225,7 @@ static int more_work(struct cache *cache)
                        !list_empty(&cache->need_commit_migrations);
        else
                return !bio_list_empty(&cache->deferred_bios) ||
+                       !list_empty(&cache->deferred_cells) ||
                        !bio_list_empty(&cache->deferred_flush_bios) ||
                        !bio_list_empty(&cache->deferred_writethrough_bios) ||
                        !list_empty(&cache->quiesced_migrations) ||
@@ -1821,6 +2243,7 @@ static void do_worker(struct work_struct *ws)
                        writeback_some_dirty_blocks(cache);
                        process_deferred_writethrough_bios(cache);
                        process_deferred_bios(cache);
+                       process_deferred_cells(cache);
                        process_invalidation_requests(cache);
                }
 
@@ -1830,11 +2253,6 @@ static void do_worker(struct work_struct *ws)
                if (commit_if_needed(cache)) {
                        process_deferred_flush_bios(cache, false);
                        process_migrations(cache, &cache->need_commit_migrations, migration_failure);
-
-                       /*
-                        * FIXME: rollback metadata or just go into a
-                        * failure mode and error everything
-                        */
                } else {
                        process_deferred_flush_bios(cache, true);
                        process_migrations(cache, &cache->need_commit_migrations,
@@ -1853,7 +2271,7 @@ static void do_worker(struct work_struct *ws)
 static void do_waker(struct work_struct *ws)
 {
        struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker);
-       policy_tick(cache->policy);
+       policy_tick(cache->policy, true);
        wake_worker(cache);
        queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD);
 }
@@ -2407,6 +2825,12 @@ static int cache_create(struct cache_args *ca, struct cache **result)
                goto bad;
        }
        cache->cmd = cmd;
+       set_cache_mode(cache, CM_WRITE);
+       if (get_cache_mode(cache) != CM_WRITE) {
+               *error = "Unable to get write access to metadata, please check/repair metadata.";
+               r = -EINVAL;
+               goto bad;
+       }
 
        if (passthrough_mode(&cache->features)) {
                bool all_clean;
@@ -2425,6 +2849,7 @@ static int cache_create(struct cache_args *ca, struct cache **result)
        }
 
        spin_lock_init(&cache->lock);
+       INIT_LIST_HEAD(&cache->deferred_cells);
        bio_list_init(&cache->deferred_bios);
        bio_list_init(&cache->deferred_flush_bios);
        bio_list_init(&cache->deferred_writethrough_bios);
@@ -2514,6 +2939,8 @@ static int cache_create(struct cache_args *ca, struct cache **result)
        spin_lock_init(&cache->invalidation_lock);
        INIT_LIST_HEAD(&cache->invalidation_requests);
 
+       iot_init(&cache->origin_tracker);
+
        *result = cache;
        return 0;
 
@@ -2580,15 +3007,23 @@ out:
        return r;
 }
 
-static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell **cell)
+/*----------------------------------------------------------------*/
+
+static int cache_map(struct dm_target *ti, struct bio *bio)
 {
+       struct cache *cache = ti->private;
+
        int r;
+       struct dm_bio_prison_cell *cell = NULL;
        dm_oblock_t block = get_bio_block(cache, bio);
        size_t pb_data_size = get_per_bio_data_size(cache);
        bool can_migrate = false;
-       bool discarded_block;
+       bool fast_promotion;
        struct policy_result lookup_result;
        struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size);
+       struct old_oblock_lock ool;
+
+       ool.locker.fn = null_locker;
 
        if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) {
                /*
@@ -2597,10 +3032,11 @@ static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_priso
                 * Just remap to the origin and carry on.
                 */
                remap_to_origin(cache, bio);
+               accounted_begin(cache, bio);
                return DM_MAPIO_REMAPPED;
        }
 
-       if (bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD)) {
+       if (discard_or_flush(bio)) {
                defer_bio(cache, bio);
                return DM_MAPIO_SUBMITTED;
        }
@@ -2608,15 +3044,15 @@ static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_priso
        /*
         * Check to see if that block is currently migrating.
         */
-       *cell = alloc_prison_cell(cache);
-       if (!*cell) {
+       cell = alloc_prison_cell(cache);
+       if (!cell) {
                defer_bio(cache, bio);
                return DM_MAPIO_SUBMITTED;
        }
 
-       r = bio_detain(cache, block, bio, *cell,
+       r = bio_detain(cache, block, bio, cell,
                       (cell_free_fn) free_prison_cell,
-                      cache, cell);
+                      cache, &cell);
        if (r) {
                if (r < 0)
                        defer_bio(cache, bio);
@@ -2624,17 +3060,18 @@ static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_priso
                return DM_MAPIO_SUBMITTED;
        }
 
-       discarded_block = is_discarded_oblock(cache, block);
+       fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio);
 
-       r = policy_map(cache->policy, block, false, can_migrate, discarded_block,
-                      bio, &lookup_result);
+       r = policy_map(cache->policy, block, false, can_migrate, fast_promotion,
+                      bio, &ool.locker, &lookup_result);
        if (r == -EWOULDBLOCK) {
-               cell_defer(cache, *cell, true);
+               cell_defer(cache, cell, true);
                return DM_MAPIO_SUBMITTED;
 
        } else if (r) {
-               DMERR_LIMIT("Unexpected return from cache replacement policy: %d", r);
-               cell_defer(cache, *cell, false);
+               DMERR_LIMIT("%s: Unexpected return from cache replacement policy: %d",
+                           cache_device_name(cache), r);
+               cell_defer(cache, cell, false);
                bio_io_error(bio);
                return DM_MAPIO_SUBMITTED;
        }
@@ -2648,21 +3085,30 @@ static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_priso
                                 * We need to invalidate this block, so
                                 * defer for the worker thread.
                                 */
-                               cell_defer(cache, *cell, true);
+                               cell_defer(cache, cell, true);
                                r = DM_MAPIO_SUBMITTED;
 
                        } else {
                                inc_miss_counter(cache, bio);
                                remap_to_origin_clear_discard(cache, bio, block);
+                               accounted_begin(cache, bio);
+                               inc_ds(cache, bio, cell);
+                               // FIXME: we want to remap hits or misses straight
+                               // away rather than passing over to the worker.
+                               cell_defer(cache, cell, false);
                        }
 
                } else {
                        inc_hit_counter(cache, bio);
                        if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) &&
-                           !is_dirty(cache, lookup_result.cblock))
+                           !is_dirty(cache, lookup_result.cblock)) {
                                remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
-                       else
-                               remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
+                               accounted_begin(cache, bio);
+                               inc_ds(cache, bio, cell);
+                               cell_defer(cache, cell, false);
+
+                       } else
+                               remap_cell_to_cache_dirty(cache, cell, block, lookup_result.cblock, false);
                }
                break;
 
@@ -2674,18 +3120,19 @@ static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_priso
                         * longer needed because the block has been demoted.
                         */
                        bio_endio(bio, 0);
-                       cell_defer(cache, *cell, false);
+                       // FIXME: remap everything as a miss
+                       cell_defer(cache, cell, false);
                        r = DM_MAPIO_SUBMITTED;
 
                } else
-                       remap_to_origin_clear_discard(cache, bio, block);
-
+                       remap_cell_to_origin_clear_discard(cache, cell, block, false);
                break;
 
        default:
-               DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__,
+               DMERR_LIMIT("%s: %s: erroring bio: unknown policy op: %u",
+                           cache_device_name(cache), __func__,
                            (unsigned) lookup_result.op);
-               cell_defer(cache, *cell, false);
+               cell_defer(cache, cell, false);
                bio_io_error(bio);
                r = DM_MAPIO_SUBMITTED;
        }
@@ -2693,21 +3140,6 @@ static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_priso
        return r;
 }
 
-static int cache_map(struct dm_target *ti, struct bio *bio)
-{
-       int r;
-       struct dm_bio_prison_cell *cell = NULL;
-       struct cache *cache = ti->private;
-
-       r = __cache_map(cache, bio, &cell);
-       if (r == DM_MAPIO_REMAPPED && cell) {
-               inc_ds(cache, bio, cell);
-               cell_defer(cache, cell, false);
-       }
-
-       return r;
-}
-
 static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
 {
        struct cache *cache = ti->private;
@@ -2716,7 +3148,7 @@ static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
        struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
 
        if (pb->tick) {
-               policy_tick(cache->policy);
+               policy_tick(cache->policy, false);
 
                spin_lock_irqsave(&cache->lock, flags);
                cache->need_tick_bio = true;
@@ -2724,6 +3156,7 @@ static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
        }
 
        check_for_quiesced_migrations(cache, pb);
+       accounted_complete(cache, bio);
 
        return 0;
 }
@@ -2732,11 +3165,16 @@ static int write_dirty_bitset(struct cache *cache)
 {
        unsigned i, r;
 
+       if (get_cache_mode(cache) >= CM_READ_ONLY)
+               return -EINVAL;
+
        for (i = 0; i < from_cblock(cache->cache_size); i++) {
                r = dm_cache_set_dirty(cache->cmd, to_cblock(i),
                                       is_dirty(cache, to_cblock(i)));
-               if (r)
+               if (r) {
+                       metadata_operation_failed(cache, "dm_cache_set_dirty", r);
                        return r;
+               }
        }
 
        return 0;
@@ -2746,18 +3184,40 @@ static int write_discard_bitset(struct cache *cache)
 {
        unsigned i, r;
 
+       if (get_cache_mode(cache) >= CM_READ_ONLY)
+               return -EINVAL;
+
        r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size,
                                           cache->discard_nr_blocks);
        if (r) {
-               DMERR("could not resize on-disk discard bitset");
+               DMERR("%s: could not resize on-disk discard bitset", cache_device_name(cache));
+               metadata_operation_failed(cache, "dm_cache_discard_bitset_resize", r);
                return r;
        }
 
        for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) {
                r = dm_cache_set_discard(cache->cmd, to_dblock(i),
                                         is_discarded(cache, to_dblock(i)));
-               if (r)
+               if (r) {
+                       metadata_operation_failed(cache, "dm_cache_set_discard", r);
                        return r;
+               }
+       }
+
+       return 0;
+}
+
+static int write_hints(struct cache *cache)
+{
+       int r;
+
+       if (get_cache_mode(cache) >= CM_READ_ONLY)
+               return -EINVAL;
+
+       r = dm_cache_write_hints(cache->cmd, cache->policy);
+       if (r) {
+               metadata_operation_failed(cache, "dm_cache_write_hints", r);
+               return r;
        }
 
        return 0;
@@ -2772,26 +3232,26 @@ static bool sync_metadata(struct cache *cache)
 
        r1 = write_dirty_bitset(cache);
        if (r1)
-               DMERR("could not write dirty bitset");
+               DMERR("%s: could not write dirty bitset", cache_device_name(cache));
 
        r2 = write_discard_bitset(cache);
        if (r2)
-               DMERR("could not write discard bitset");
+               DMERR("%s: could not write discard bitset", cache_device_name(cache));
 
        save_stats(cache);
 
-       r3 = dm_cache_write_hints(cache->cmd, cache->policy);
+       r3 = write_hints(cache);
        if (r3)
-               DMERR("could not write hints");
+               DMERR("%s: could not write hints", cache_device_name(cache));
 
        /*
         * If writing the above metadata failed, we still commit, but don't
         * set the clean shutdown flag.  This will effectively force every
         * dirty bit to be set on reload.
         */
-       r4 = dm_cache_commit(cache->cmd, !r1 && !r2 && !r3);
+       r4 = commit(cache, !r1 && !r2 && !r3);
        if (r4)
-               DMERR("could not write cache metadata.  Data loss may occur.");
+               DMERR("%s: could not write cache metadata", cache_device_name(cache));
 
        return !r1 && !r2 && !r3 && !r4;
 }
@@ -2803,10 +3263,12 @@ static void cache_postsuspend(struct dm_target *ti)
        start_quiescing(cache);
        wait_for_migrations(cache);
        stop_worker(cache);
-       requeue_deferred_io(cache);
+       requeue_deferred_bios(cache);
+       requeue_deferred_cells(cache);
        stop_quiescing(cache);
 
-       (void) sync_metadata(cache);
+       if (get_cache_mode(cache) == CM_WRITE)
+               (void) sync_metadata(cache);
 }
 
 static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
@@ -2929,7 +3391,8 @@ static bool can_resize(struct cache *cache, dm_cblock_t new_size)
        while (from_cblock(new_size) < from_cblock(cache->cache_size)) {
                new_size = to_cblock(from_cblock(new_size) + 1);
                if (is_dirty(cache, new_size)) {
-                       DMERR("unable to shrink cache; cache block %llu is dirty",
+                       DMERR("%s: unable to shrink cache; cache block %llu is dirty",
+                             cache_device_name(cache),
                              (unsigned long long) from_cblock(new_size));
                        return false;
                }
@@ -2944,7 +3407,8 @@ static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size)
 
        r = dm_cache_resize(cache->cmd, new_size);
        if (r) {
-               DMERR("could not resize cache metadata");
+               DMERR("%s: could not resize cache metadata", cache_device_name(cache));
+               metadata_operation_failed(cache, "dm_cache_resize", r);
                return r;
        }
 
@@ -2982,7 +3446,8 @@ static int cache_preresume(struct dm_target *ti)
                r = dm_cache_load_mappings(cache->cmd, cache->policy,
                                           load_mapping, cache);
                if (r) {
-                       DMERR("could not load cache mappings");
+                       DMERR("%s: could not load cache mappings", cache_device_name(cache));
+                       metadata_operation_failed(cache, "dm_cache_load_mappings", r);
                        return r;
                }
 
@@ -3002,7 +3467,8 @@ static int cache_preresume(struct dm_target *ti)
                discard_load_info_init(cache, &li);
                r = dm_cache_load_discards(cache->cmd, load_discard, &li);
                if (r) {
-                       DMERR("could not load origin discards");
+                       DMERR("%s: could not load origin discards", cache_device_name(cache));
+                       metadata_operation_failed(cache, "dm_cache_load_discards", r);
                        return r;
                }
                set_discard_range(&li);
@@ -3030,7 +3496,7 @@ static void cache_resume(struct dm_target *ti)
  * <#demotions> <#promotions> <#dirty>
  * <#features> <features>*
  * <#core args> <core args>
- * <policy name> <#policy args> <policy args>*
+ * <policy name> <#policy args> <policy args>* <cache metadata mode>
  */
 static void cache_status(struct dm_target *ti, status_type_t type,
                         unsigned status_flags, char *result, unsigned maxlen)
@@ -3046,23 +3512,26 @@ static void cache_status(struct dm_target *ti, status_type_t type,
 
        switch (type) {
        case STATUSTYPE_INFO:
-               /* Commit to ensure statistics aren't out-of-date */
-               if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) {
-                       r = dm_cache_commit(cache->cmd, false);
-                       if (r)
-                               DMERR("could not commit metadata for accurate status");
+               if (get_cache_mode(cache) == CM_FAIL) {
+                       DMEMIT("Fail");
+                       break;
                }
 
-               r = dm_cache_get_free_metadata_block_count(cache->cmd,
-                                                          &nr_free_blocks_metadata);
+               /* Commit to ensure statistics aren't out-of-date */
+               if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
+                       (void) commit(cache, false);
+
+               r = dm_cache_get_free_metadata_block_count(cache->cmd, &nr_free_blocks_metadata);
                if (r) {
-                       DMERR("could not get metadata free block count");
+                       DMERR("%s: dm_cache_get_free_metadata_block_count returned %d",
+                             cache_device_name(cache), r);
                        goto err;
                }
 
                r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata);
                if (r) {
-                       DMERR("could not get metadata device size");
+                       DMERR("%s: dm_cache_get_metadata_dev_size returned %d",
+                             cache_device_name(cache), r);
                        goto err;
                }
 
@@ -3093,7 +3562,8 @@ static void cache_status(struct dm_target *ti, status_type_t type,
                        DMEMIT("1 writeback ");
 
                else {
-                       DMERR("internal error: unknown io mode: %d", (int) cache->features.io_mode);
+                       DMERR("%s: internal error: unknown io mode: %d",
+                             cache_device_name(cache), (int) cache->features.io_mode);
                        goto err;
                }
 
@@ -3101,11 +3571,17 @@ static void cache_status(struct dm_target *ti, status_type_t type,
 
                DMEMIT("%s ", dm_cache_policy_get_name(cache->policy));
                if (sz < maxlen) {
-                       r = policy_emit_config_values(cache->policy, result + sz, maxlen - sz);
+                       r = policy_emit_config_values(cache->policy, result, maxlen, &sz);
                        if (r)
-                               DMERR("policy_emit_config_values returned %d", r);
+                               DMERR("%s: policy_emit_config_values returned %d",
+                                     cache_device_name(cache), r);
                }
 
+               if (get_cache_mode(cache) == CM_READ_ONLY)
+                       DMEMIT("ro ");
+               else
+                       DMEMIT("rw ");
+
                break;
 
        case STATUSTYPE_TABLE:
@@ -3167,7 +3643,7 @@ static int parse_cblock_range(struct cache *cache, const char *str,
                return 0;
        }
 
-       DMERR("invalid cblock range '%s'", str);
+       DMERR("%s: invalid cblock range '%s'", cache_device_name(cache), str);
        return -EINVAL;
 }
 
@@ -3178,17 +3654,20 @@ static int validate_cblock_range(struct cache *cache, struct cblock_range *range
        uint64_t n = from_cblock(cache->cache_size);
 
        if (b >= n) {
-               DMERR("begin cblock out of range: %llu >= %llu", b, n);
+               DMERR("%s: begin cblock out of range: %llu >= %llu",
+                     cache_device_name(cache), b, n);
                return -EINVAL;
        }
 
        if (e > n) {
-               DMERR("end cblock out of range: %llu > %llu", e, n);
+               DMERR("%s: end cblock out of range: %llu > %llu",
+                     cache_device_name(cache), e, n);
                return -EINVAL;
        }
 
        if (b >= e) {
-               DMERR("invalid cblock range: %llu >= %llu", b, e);
+               DMERR("%s: invalid cblock range: %llu >= %llu",
+                     cache_device_name(cache), b, e);
                return -EINVAL;
        }
 
@@ -3222,7 +3701,8 @@ static int process_invalidate_cblocks_message(struct cache *cache, unsigned coun
        struct cblock_range range;
 
        if (!passthrough_mode(&cache->features)) {
-               DMERR("cache has to be in passthrough mode for invalidation");
+               DMERR("%s: cache has to be in passthrough mode for invalidation",
+                     cache_device_name(cache));
                return -EPERM;
        }
 
@@ -3261,6 +3741,12 @@ static int cache_message(struct dm_target *ti, unsigned argc, char **argv)
        if (!argc)
                return -EINVAL;
 
+       if (get_cache_mode(cache) >= CM_READ_ONLY) {
+               DMERR("%s: unable to service cache target messages in READ_ONLY or FAIL mode",
+                     cache_device_name(cache));
+               return -EOPNOTSUPP;
+       }
+
        if (!strcasecmp(argv[0], "invalidate_cblocks"))
                return process_invalidate_cblocks_message(cache, argc - 1, (const char **) argv + 1);
 
@@ -3334,7 +3820,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
 
 static struct target_type cache_target = {
        .name = "cache",
-       .version = {1, 6, 0},
+       .version = {1, 7, 0},
        .module = THIS_MODULE,
        .ctr = cache_ctr,
        .dtr = cache_dtr,
index 5503e43..0f48fed 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * Copyright (C) 2003 Jana Saout <jana@saout.de>
  * Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org>
- * Copyright (C) 2006-2009 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2006-2015 Red Hat, Inc. All rights reserved.
  * Copyright (C) 2013 Milan Broz <gmazyland@gmail.com>
  *
  * This file is released under the GPL.
@@ -891,6 +891,11 @@ static void crypt_alloc_req(struct crypt_config *cc,
                ctx->req = mempool_alloc(cc->req_pool, GFP_NOIO);
 
        ablkcipher_request_set_tfm(ctx->req, cc->tfms[key_index]);
+
+       /*
+        * Use REQ_MAY_BACKLOG so a cipher driver internally backlogs
+        * requests if driver request queue is full.
+        * requests if its request queue is full.
        ablkcipher_request_set_callback(ctx->req,
            CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
            kcryptd_async_done, dmreq_of_req(cc, ctx->req));
@@ -924,24 +929,32 @@ static int crypt_convert(struct crypt_config *cc,
                r = crypt_convert_block(cc, ctx, ctx->req);
 
                switch (r) {
-               /* async */
+               /*
+                * The request was queued by a crypto driver
+                * but the driver's request queue is full, so let's wait.
+                */
                case -EBUSY:
                        wait_for_completion(&ctx->restart);
                        reinit_completion(&ctx->restart);
-                       /* fall through*/
+                       /* fall through */
+               /*
+                * The request is queued and processed asynchronously;
+                * the completion function kcryptd_async_done() will be called.
+                */
                case -EINPROGRESS:
                        ctx->req = NULL;
                        ctx->cc_sector++;
                        continue;
-
-               /* sync */
+               /*
+                * The request was already processed (synchronously).
+                */
                case 0:
                        atomic_dec(&ctx->cc_pending);
                        ctx->cc_sector++;
                        cond_resched();
                        continue;
 
-               /* error */
+               /* There was an error while processing the request. */
                default:
                        atomic_dec(&ctx->cc_pending);
                        return r;
@@ -1346,6 +1359,11 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
        struct dm_crypt_io *io = container_of(ctx, struct dm_crypt_io, ctx);
        struct crypt_config *cc = io->cc;
 
+       /*
+        * A request from the crypto driver backlog is about to be processed;
+        * finish the completion and continue in crypt_convert().
+        * (The callback will be called a second time for this request.)
+        */
        if (error == -EINPROGRESS) {
                complete(&ctx->restart);
                return;
index 93e0844..ad1b049 100644 (file)
@@ -55,8 +55,8 @@
 #define LOG_DISCARD_FLAG (1 << 2)
 #define LOG_MARK_FLAG (1 << 3)
 
-#define WRITE_LOG_VERSION 1
-#define WRITE_LOG_MAGIC 0x6a736677736872
+#define WRITE_LOG_VERSION 1ULL
+#define WRITE_LOG_MAGIC 0x6a736677736872ULL
 
 /*
  * The disk format for this is braindead simple.
index 88e4c7f..2daa677 100644 (file)
@@ -1,6 +1,6 @@
 /*
  * Copyright (C) 2010-2011 Neil Brown
- * Copyright (C) 2010-2014 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2010-2015 Red Hat, Inc. All rights reserved.
  *
  * This file is released under the GPL.
  */
@@ -17,6 +17,7 @@
 #include <linux/device-mapper.h>
 
 #define DM_MSG_PREFIX "raid"
+#define        MAX_RAID_DEVICES        253 /* raid4/5/6 limit */
 
 static bool devices_handle_discard_safely = false;
 
@@ -45,25 +46,25 @@ struct raid_dev {
 };
 
 /*
- * Flags for rs->print_flags field.
+ * Flags for rs->ctr_flags field.
  */
-#define DMPF_SYNC              0x1
-#define DMPF_NOSYNC            0x2
-#define DMPF_REBUILD           0x4
-#define DMPF_DAEMON_SLEEP      0x8
-#define DMPF_MIN_RECOVERY_RATE 0x10
-#define DMPF_MAX_RECOVERY_RATE 0x20
-#define DMPF_MAX_WRITE_BEHIND  0x40
-#define DMPF_STRIPE_CACHE      0x80
-#define DMPF_REGION_SIZE       0x100
-#define DMPF_RAID10_COPIES     0x200
-#define DMPF_RAID10_FORMAT     0x400
+#define CTR_FLAG_SYNC              0x1
+#define CTR_FLAG_NOSYNC            0x2
+#define CTR_FLAG_REBUILD           0x4
+#define CTR_FLAG_DAEMON_SLEEP      0x8
+#define CTR_FLAG_MIN_RECOVERY_RATE 0x10
+#define CTR_FLAG_MAX_RECOVERY_RATE 0x20
+#define CTR_FLAG_MAX_WRITE_BEHIND  0x40
+#define CTR_FLAG_STRIPE_CACHE      0x80
+#define CTR_FLAG_REGION_SIZE       0x100
+#define CTR_FLAG_RAID10_COPIES     0x200
+#define CTR_FLAG_RAID10_FORMAT     0x400
 
 struct raid_set {
        struct dm_target *ti;
 
        uint32_t bitmap_loaded;
-       uint32_t print_flags;
+       uint32_t ctr_flags;
 
        struct mddev md;
        struct raid_type *raid_type;
@@ -81,6 +82,7 @@ static struct raid_type {
        const unsigned level;           /* RAID level. */
        const unsigned algorithm;       /* RAID algorithm. */
 } raid_types[] = {
+       {"raid0",    "RAID0 (striping)",                0, 2, 0, 0 /* NONE */},
        {"raid1",    "RAID1 (mirroring)",               0, 2, 1, 0 /* NONE */},
        {"raid10",   "RAID10 (striped mirrors)",        0, 2, 10, UINT_MAX /* Varies */},
        {"raid4",    "RAID4 (dedicated parity disk)",   1, 2, 5, ALGORITHM_PARITY_0},
@@ -119,15 +121,15 @@ static int raid10_format_to_md_layout(char *format, unsigned copies)
 {
        unsigned n = 1, f = 1;
 
-       if (!strcmp("near", format))
+       if (!strcasecmp("near", format))
                n = copies;
        else
                f = copies;
 
-       if (!strcmp("offset", format))
+       if (!strcasecmp("offset", format))
                return 0x30000 | (f << 8) | n;
 
-       if (!strcmp("far", format))
+       if (!strcasecmp("far", format))
                return 0x20000 | (f << 8) | n;
 
        return (f << 8) | n;
@@ -477,8 +479,6 @@ too_many:
  *                                      will form the "stripe"
  *    [[no]sync]                       Force or prevent recovery of the
  *                                      entire array
- *    [devices_handle_discard_safely]  Allow discards on RAID4/5/6; useful if RAID
- *                                     member device(s) properly support TRIM/UNMAP
  *    [rebuild <idx>]                  Rebuild the drive indicated by the index
  *    [daemon_sleep <ms>]              Time between bitmap daemon work to
  *                                      clear bits
@@ -555,12 +555,12 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
        for (i = 0; i < num_raid_params; i++) {
                if (!strcasecmp(argv[i], "nosync")) {
                        rs->md.recovery_cp = MaxSector;
-                       rs->print_flags |= DMPF_NOSYNC;
+                       rs->ctr_flags |= CTR_FLAG_NOSYNC;
                        continue;
                }
                if (!strcasecmp(argv[i], "sync")) {
                        rs->md.recovery_cp = 0;
-                       rs->print_flags |= DMPF_SYNC;
+                       rs->ctr_flags |= CTR_FLAG_SYNC;
                        continue;
                }
 
@@ -585,7 +585,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
                                return -EINVAL;
                        }
                        raid10_format = argv[i];
-                       rs->print_flags |= DMPF_RAID10_FORMAT;
+                       rs->ctr_flags |= CTR_FLAG_RAID10_FORMAT;
                        continue;
                }
 
@@ -602,7 +602,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
                        }
                        clear_bit(In_sync, &rs->dev[value].rdev.flags);
                        rs->dev[value].rdev.recovery_offset = 0;
-                       rs->print_flags |= DMPF_REBUILD;
+                       rs->ctr_flags |= CTR_FLAG_REBUILD;
                } else if (!strcasecmp(key, "write_mostly")) {
                        if (rs->raid_type->level != 1) {
                                rs->ti->error = "write_mostly option is only valid for RAID1";
@@ -618,7 +618,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
                                rs->ti->error = "max_write_behind option is only valid for RAID1";
                                return -EINVAL;
                        }
-                       rs->print_flags |= DMPF_MAX_WRITE_BEHIND;
+                       rs->ctr_flags |= CTR_FLAG_MAX_WRITE_BEHIND;
 
                        /*
                         * In device-mapper, we specify things in sectors, but
@@ -631,14 +631,14 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
                        }
                        rs->md.bitmap_info.max_write_behind = value;
                } else if (!strcasecmp(key, "daemon_sleep")) {
-                       rs->print_flags |= DMPF_DAEMON_SLEEP;
+                       rs->ctr_flags |= CTR_FLAG_DAEMON_SLEEP;
                        if (!value || (value > MAX_SCHEDULE_TIMEOUT)) {
                                rs->ti->error = "daemon sleep period out of range";
                                return -EINVAL;
                        }
                        rs->md.bitmap_info.daemon_sleep = value;
                } else if (!strcasecmp(key, "stripe_cache")) {
-                       rs->print_flags |= DMPF_STRIPE_CACHE;
+                       rs->ctr_flags |= CTR_FLAG_STRIPE_CACHE;
 
                        /*
                         * In device-mapper, we specify things in sectors, but
@@ -656,21 +656,21 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
                                return -EINVAL;
                        }
                } else if (!strcasecmp(key, "min_recovery_rate")) {
-                       rs->print_flags |= DMPF_MIN_RECOVERY_RATE;
+                       rs->ctr_flags |= CTR_FLAG_MIN_RECOVERY_RATE;
                        if (value > INT_MAX) {
                                rs->ti->error = "min_recovery_rate out of range";
                                return -EINVAL;
                        }
                        rs->md.sync_speed_min = (int)value;
                } else if (!strcasecmp(key, "max_recovery_rate")) {
-                       rs->print_flags |= DMPF_MAX_RECOVERY_RATE;
+                       rs->ctr_flags |= CTR_FLAG_MAX_RECOVERY_RATE;
                        if (value > INT_MAX) {
                                rs->ti->error = "max_recovery_rate out of range";
                                return -EINVAL;
                        }
                        rs->md.sync_speed_max = (int)value;
                } else if (!strcasecmp(key, "region_size")) {
-                       rs->print_flags |= DMPF_REGION_SIZE;
+                       rs->ctr_flags |= CTR_FLAG_REGION_SIZE;
                        region_size = value;
                } else if (!strcasecmp(key, "raid10_copies") &&
                           (rs->raid_type->level == 10)) {
@@ -678,7 +678,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
                                rs->ti->error = "Bad value for 'raid10_copies'";
                                return -EINVAL;
                        }
-                       rs->print_flags |= DMPF_RAID10_COPIES;
+                       rs->ctr_flags |= CTR_FLAG_RAID10_COPIES;
                        raid10_copies = value;
                } else {
                        DMERR("Unable to parse RAID parameter: %s", key);
@@ -720,7 +720,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
                rs->md.layout = raid10_format_to_md_layout(raid10_format,
                                                           raid10_copies);
                rs->md.new_layout = rs->md.layout;
-       } else if ((rs->raid_type->level > 1) &&
+       } else if ((!rs->raid_type->level || rs->raid_type->level > 1) &&
                   sector_div(sectors_per_dev,
                              (rs->md.raid_disks - rs->raid_type->parity_devs))) {
                rs->ti->error = "Target length not divisible by number of data devices";
@@ -947,7 +947,7 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
                return -EINVAL;
        }
 
-       if (!(rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC)))
+       if (!(rs->ctr_flags & (CTR_FLAG_SYNC | CTR_FLAG_NOSYNC)))
                mddev->recovery_cp = le64_to_cpu(sb->array_resync_offset);
 
        /*
@@ -1026,8 +1026,9 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
        return 0;
 }
 
-static int super_validate(struct mddev *mddev, struct md_rdev *rdev)
+static int super_validate(struct raid_set *rs, struct md_rdev *rdev)
 {
+       struct mddev *mddev = &rs->md;
        struct dm_raid_superblock *sb = page_address(rdev->sb_page);
 
        /*
@@ -1037,8 +1038,10 @@ static int super_validate(struct mddev *mddev, struct md_rdev *rdev)
        if (!mddev->events && super_init_validation(mddev, rdev))
                return -EINVAL;
 
-       mddev->bitmap_info.offset = 4096 >> 9; /* Enable bitmap creation */
-       rdev->mddev->bitmap_info.default_offset = 4096 >> 9;
+       /* Enable bitmap creation for RAID levels != 0 */
+       mddev->bitmap_info.offset = (rs->raid_type->level) ? to_sector(4096) : 0;
+       rdev->mddev->bitmap_info.default_offset = mddev->bitmap_info.offset;
+
        if (!test_bit(FirstUse, &rdev->flags)) {
                rdev->recovery_offset = le64_to_cpu(sb->disk_recovery_offset);
                if (rdev->recovery_offset != MaxSector)
@@ -1073,7 +1076,7 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
        freshest = NULL;
        rdev_for_each_safe(rdev, tmp, mddev) {
                /*
-                * Skipping super_load due to DMPF_SYNC will cause
+                * Skipping super_load due to CTR_FLAG_SYNC will cause
                 * the array to undergo initialization again as
                 * though it were new.  This is the intended effect
                 * of the "sync" directive.
@@ -1082,7 +1085,9 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
                 * that the "sync" directive is disallowed during the
                 * reshape.
                 */
-               if (rs->print_flags & DMPF_SYNC)
+               rdev->sectors = to_sector(i_size_read(rdev->bdev->bd_inode));
+
+               if (rs->ctr_flags & CTR_FLAG_SYNC)
                        continue;
 
                if (!rdev->meta_bdev)
@@ -1140,11 +1145,11 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
         * validation for the remaining devices.
         */
        ti->error = "Unable to assemble array: Invalid superblocks";
-       if (super_validate(mddev, freshest))
+       if (super_validate(rs, freshest))
                return -EINVAL;
 
        rdev_for_each(rdev, mddev)
-               if ((rdev != freshest) && super_validate(mddev, rdev))
+               if ((rdev != freshest) && super_validate(rs, rdev))
                        return -EINVAL;
 
        return 0;
@@ -1243,7 +1248,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
        }
 
        if ((kstrtoul(argv[num_raid_params], 10, &num_raid_devs) < 0) ||
-           (num_raid_devs >= INT_MAX)) {
+           (num_raid_devs > MAX_RAID_DEVICES)) {
                ti->error = "Cannot understand number of raid devices";
                return -EINVAL;
        }
@@ -1282,10 +1287,11 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
         */
        configure_discard_support(ti, rs);
 
-       mutex_lock(&rs->md.reconfig_mutex);
+       /* Has to be held on running the array */
+       mddev_lock_nointr(&rs->md);
        ret = md_run(&rs->md);
        rs->md.in_sync = 0; /* Assume already marked dirty */
-       mutex_unlock(&rs->md.reconfig_mutex);
+       mddev_unlock(&rs->md);
 
        if (ret) {
                ti->error = "Fail to run raid array";
@@ -1368,34 +1374,40 @@ static void raid_status(struct dm_target *ti, status_type_t type,
        case STATUSTYPE_INFO:
                DMEMIT("%s %d ", rs->raid_type->name, rs->md.raid_disks);
 
-               if (test_bit(MD_RECOVERY_RUNNING, &rs->md.recovery))
-                       sync = rs->md.curr_resync_completed;
-               else
-                       sync = rs->md.recovery_cp;
-
-               if (sync >= rs->md.resync_max_sectors) {
-                       /*
-                        * Sync complete.
-                        */
+               if (rs->raid_type->level) {
+                       if (test_bit(MD_RECOVERY_RUNNING, &rs->md.recovery))
+                               sync = rs->md.curr_resync_completed;
+                       else
+                               sync = rs->md.recovery_cp;
+
+                       if (sync >= rs->md.resync_max_sectors) {
+                               /*
+                                * Sync complete.
+                                */
+                               array_in_sync = 1;
+                               sync = rs->md.resync_max_sectors;
+                       } else if (test_bit(MD_RECOVERY_REQUESTED, &rs->md.recovery)) {
+                               /*
+                                * If "check" or "repair" is occurring, the array has
+                                * undergone an initial sync and the health characters
+                                * should not be 'a' anymore.
+                                */
+                               array_in_sync = 1;
+                       } else {
+                               /*
+                                * The array may be doing an initial sync, or it may
+                                * be rebuilding individual components.  If all the
+                                * devices are In_sync, then it is the array that is
+                                * being initialized.
+                                */
+                               for (i = 0; i < rs->md.raid_disks; i++)
+                                       if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
+                                               array_in_sync = 1;
+                       }
+               } else {
+                       /* RAID0 */
                        array_in_sync = 1;
                        sync = rs->md.resync_max_sectors;
-               } else if (test_bit(MD_RECOVERY_REQUESTED, &rs->md.recovery)) {
-                       /*
-                        * If "check" or "repair" is occurring, the array has
-                        * undergone and initial sync and the health characters
-                        * should not be 'a' anymore.
-                        */
-                       array_in_sync = 1;
-               } else {
-                       /*
-                        * The array may be doing an initial sync, or it may
-                        * be rebuilding individual components.  If all the
-                        * devices are In_sync, then it is the array that is
-                        * being initialized.
-                        */
-                       for (i = 0; i < rs->md.raid_disks; i++)
-                               if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
-                                       array_in_sync = 1;
                }
 
                /*
@@ -1446,7 +1458,7 @@ static void raid_status(struct dm_target *ti, status_type_t type,
        case STATUSTYPE_TABLE:
                /* The string you would use to construct this array */
                for (i = 0; i < rs->md.raid_disks; i++) {
-                       if ((rs->print_flags & DMPF_REBUILD) &&
+                       if ((rs->ctr_flags & CTR_FLAG_REBUILD) &&
                            rs->dev[i].data_dev &&
                            !test_bit(In_sync, &rs->dev[i].rdev.flags))
                                raid_param_cnt += 2; /* for rebuilds */
@@ -1455,33 +1467,33 @@ static void raid_status(struct dm_target *ti, status_type_t type,
                                raid_param_cnt += 2;
                }
 
-               raid_param_cnt += (hweight32(rs->print_flags & ~DMPF_REBUILD) * 2);
-               if (rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC))
+               raid_param_cnt += (hweight32(rs->ctr_flags & ~CTR_FLAG_REBUILD) * 2);
+               if (rs->ctr_flags & (CTR_FLAG_SYNC | CTR_FLAG_NOSYNC))
                        raid_param_cnt--;
 
                DMEMIT("%s %u %u", rs->raid_type->name,
                       raid_param_cnt, rs->md.chunk_sectors);
 
-               if ((rs->print_flags & DMPF_SYNC) &&
+               if ((rs->ctr_flags & CTR_FLAG_SYNC) &&
                    (rs->md.recovery_cp == MaxSector))
                        DMEMIT(" sync");
-               if (rs->print_flags & DMPF_NOSYNC)
+               if (rs->ctr_flags & CTR_FLAG_NOSYNC)
                        DMEMIT(" nosync");
 
                for (i = 0; i < rs->md.raid_disks; i++)
-                       if ((rs->print_flags & DMPF_REBUILD) &&
+                       if ((rs->ctr_flags & CTR_FLAG_REBUILD) &&
                            rs->dev[i].data_dev &&
                            !test_bit(In_sync, &rs->dev[i].rdev.flags))
                                DMEMIT(" rebuild %u", i);
 
-               if (rs->print_flags & DMPF_DAEMON_SLEEP)
+               if (rs->ctr_flags & CTR_FLAG_DAEMON_SLEEP)
                        DMEMIT(" daemon_sleep %lu",
                               rs->md.bitmap_info.daemon_sleep);
 
-               if (rs->print_flags & DMPF_MIN_RECOVERY_RATE)
+               if (rs->ctr_flags & CTR_FLAG_MIN_RECOVERY_RATE)
                        DMEMIT(" min_recovery_rate %d", rs->md.sync_speed_min);
 
-               if (rs->print_flags & DMPF_MAX_RECOVERY_RATE)
+               if (rs->ctr_flags & CTR_FLAG_MAX_RECOVERY_RATE)
                        DMEMIT(" max_recovery_rate %d", rs->md.sync_speed_max);
 
                for (i = 0; i < rs->md.raid_disks; i++)
@@ -1489,11 +1501,11 @@ static void raid_status(struct dm_target *ti, status_type_t type,
                            test_bit(WriteMostly, &rs->dev[i].rdev.flags))
                                DMEMIT(" write_mostly %u", i);
 
-               if (rs->print_flags & DMPF_MAX_WRITE_BEHIND)
+               if (rs->ctr_flags & CTR_FLAG_MAX_WRITE_BEHIND)
                        DMEMIT(" max_write_behind %lu",
                               rs->md.bitmap_info.max_write_behind);
 
-               if (rs->print_flags & DMPF_STRIPE_CACHE) {
+               if (rs->ctr_flags & CTR_FLAG_STRIPE_CACHE) {
                        struct r5conf *conf = rs->md.private;
 
                        /* convert from kiB to sectors */
@@ -1501,15 +1513,15 @@ static void raid_status(struct dm_target *ti, status_type_t type,
                               conf ? conf->max_nr_stripes * 2 : 0);
                }
 
-               if (rs->print_flags & DMPF_REGION_SIZE)
+               if (rs->ctr_flags & CTR_FLAG_REGION_SIZE)
                        DMEMIT(" region_size %lu",
                               rs->md.bitmap_info.chunksize >> 9);
 
-               if (rs->print_flags & DMPF_RAID10_COPIES)
+               if (rs->ctr_flags & CTR_FLAG_RAID10_COPIES)
                        DMEMIT(" raid10_copies %u",
                               raid10_md_layout_to_copies(rs->md.layout));
 
-               if (rs->print_flags & DMPF_RAID10_FORMAT)
+               if (rs->ctr_flags & CTR_FLAG_RAID10_FORMAT)
                        DMEMIT(" raid10_format %s",
                               raid10_md_layout_to_format(rs->md.layout));
 
@@ -1684,26 +1696,48 @@ static void raid_resume(struct dm_target *ti)
 {
        struct raid_set *rs = ti->private;
 
-       set_bit(MD_CHANGE_DEVS, &rs->md.flags);
-       if (!rs->bitmap_loaded) {
-               bitmap_load(&rs->md);
-               rs->bitmap_loaded = 1;
-       } else {
-               /*
-                * A secondary resume while the device is active.
-                * Take this opportunity to check whether any failed
-                * devices are reachable again.
-                */
-               attempt_restore_of_faulty_devices(rs);
+       if (rs->raid_type->level) {
+               set_bit(MD_CHANGE_DEVS, &rs->md.flags);
+
+               if (!rs->bitmap_loaded) {
+                       bitmap_load(&rs->md);
+                       rs->bitmap_loaded = 1;
+               } else {
+                       /*
+                        * A secondary resume while the device is active.
+                        * Take this opportunity to check whether any failed
+                        * devices are reachable again.
+                        */
+                       attempt_restore_of_faulty_devices(rs);
+               }
+
+               clear_bit(MD_RECOVERY_FROZEN, &rs->md.recovery);
        }
 
-       clear_bit(MD_RECOVERY_FROZEN, &rs->md.recovery);
        mddev_resume(&rs->md);
 }
 
+static int raid_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
+                     struct bio_vec *biovec, int max_size)
+{
+       struct raid_set *rs = ti->private;
+       struct md_personality *pers = rs->md.pers;
+
+       if (pers && pers->mergeable_bvec)
+               return min(max_size, pers->mergeable_bvec(&rs->md, bvm, biovec));
+
+       /*
+        * If we can't query the personality because the raid
+        * set is not running yet, fall back to a safe minimum.
+        */
+       return rs->md.chunk_sectors;
+}
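
The new raid_merge() hook above either defers to the MD personality's mergeable_bvec limit or, while the raid set is not yet running, falls back to the chunk size. A minimal userspace sketch of that decision follows; merge_limit() and its numbers are invented for illustration and are not part of the kernel code.

/* Illustrative model of the raid_merge() fallback above. */
#include <stdio.h>

static int min_int(int a, int b) { return a < b ? a : b; }

/* pers_limit < 0 models "personality not available yet" (set not running). */
static int merge_limit(int max_size, int pers_limit, int chunk_sectors)
{
	if (pers_limit >= 0)
		return min_int(max_size, pers_limit);
	return chunk_sectors;	/* safe minimum, as in raid_merge() */
}

int main(void)
{
	printf("%d\n", merge_limit(4096, 1024, 128));	/* running: capped at 1024 */
	printf("%d\n", merge_limit(4096, -1, 128));	/* not running: chunk size */
	return 0;
}
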
+
 static struct target_type raid_target = {
        .name = "raid",
-       .version = {1, 6, 0},
+       .version = {1, 7, 0},
        .module = THIS_MODULE,
        .ctr = raid_ctr,
        .dtr = raid_dtr,
@@ -1715,6 +1749,7 @@ static struct target_type raid_target = {
        .presuspend = raid_presuspend,
        .postsuspend = raid_postsuspend,
        .resume = raid_resume,
+       .merge = raid_merge,
 };
 
 static int __init dm_raid_init(void)
index 743fa9b..d83696b 100644 (file)
 
 #define MAX_RECOVERY 1 /* Maximum number of regions recovered in parallel. */
 
-#define DM_RAID1_HANDLE_ERRORS 0x01
+#define DM_RAID1_HANDLE_ERRORS 0x01
+#define DM_RAID1_KEEP_LOG      0x02
 #define errors_handled(p)      ((p)->features & DM_RAID1_HANDLE_ERRORS)
+#define keep_log(p)            ((p)->features & DM_RAID1_KEEP_LOG)
 
 static DECLARE_WAIT_QUEUE_HEAD(_kmirrord_recovery_stopped);
 
@@ -229,7 +231,7 @@ static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type)
        if (m != get_default_mirror(ms))
                goto out;
 
-       if (!ms->in_sync) {
+       if (!ms->in_sync && !keep_log(ms)) {
                /*
                 * Better to issue requests to same failing device
                 * than to risk returning corrupt data.
@@ -370,6 +372,17 @@ static int recover(struct mirror_set *ms, struct dm_region *reg)
        return r;
 }
 
+static void reset_ms_flags(struct mirror_set *ms)
+{
+       unsigned int m;
+
+       ms->leg_failure = 0;
+       for (m = 0; m < ms->nr_mirrors; m++) {
+               atomic_set(&(ms->mirror[m].error_count), 0);
+               ms->mirror[m].error_type = 0;
+       }
+}
+
 static void do_recovery(struct mirror_set *ms)
 {
        struct dm_region *reg;
@@ -398,6 +411,7 @@ static void do_recovery(struct mirror_set *ms)
                /* the sync is complete */
                dm_table_event(ms->ti->table);
                ms->in_sync = 1;
+               reset_ms_flags(ms);
        }
 }
 
@@ -759,7 +773,7 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
                dm_rh_delay(ms->rh, bio);
 
        while ((bio = bio_list_pop(&nosync))) {
-               if (unlikely(ms->leg_failure) && errors_handled(ms)) {
+               if (unlikely(ms->leg_failure) && errors_handled(ms) && !keep_log(ms)) {
                        spin_lock_irq(&ms->lock);
                        bio_list_add(&ms->failures, bio);
                        spin_unlock_irq(&ms->lock);
@@ -803,15 +817,21 @@ static void do_failures(struct mirror_set *ms, struct bio_list *failures)
 
                /*
                 * If all the legs are dead, fail the I/O.
-                * If we have been told to handle errors, hold the bio
-                * and wait for userspace to deal with the problem.
+                * If the log device has failed and keep_log is enabled,
+                * fail the I/O.
+                *
+                * If we have been told to handle errors, and keep_log
+                * isn't enabled, hold the bio and wait for userspace to
+                * deal with the problem.
+                *
                 * Otherwise pretend that the I/O succeeded. (This would
                 * be wrong if the failed leg returned after reboot and
                 * got replicated back to the good legs.)
                 */
-               if (!get_valid_mirror(ms))
+
+               if (unlikely(!get_valid_mirror(ms) || (keep_log(ms) && ms->log_failure)))
                        bio_endio(bio, -EIO);
-               else if (errors_handled(ms))
+               else if (errors_handled(ms) && !keep_log(ms))
                        hold_bio(ms, bio);
                else
                        bio_endio(bio, 0);
@@ -987,6 +1007,7 @@ static int parse_features(struct mirror_set *ms, unsigned argc, char **argv,
        unsigned num_features;
        struct dm_target *ti = ms->ti;
        char dummy;
+       int i;
 
        *args_used = 0;
 
@@ -1007,15 +1028,25 @@ static int parse_features(struct mirror_set *ms, unsigned argc, char **argv,
                return -EINVAL;
        }
 
-       if (!strcmp("handle_errors", argv[0]))
-               ms->features |= DM_RAID1_HANDLE_ERRORS;
-       else {
-               ti->error = "Unrecognised feature requested";
+       for (i = 0; i < num_features; i++) {
+               if (!strcmp("handle_errors", argv[0]))
+                       ms->features |= DM_RAID1_HANDLE_ERRORS;
+               else if (!strcmp("keep_log", argv[0]))
+                       ms->features |= DM_RAID1_KEEP_LOG;
+               else {
+                       ti->error = "Unrecognised feature requested";
+                       return -EINVAL;
+               }
+
+               argc--;
+               argv++;
+               (*args_used)++;
+       }
+       if (!errors_handled(ms) && keep_log(ms)) {
+               ti->error = "keep_log feature requires the handle_errors feature";
                return -EINVAL;
        }
 
-       (*args_used)++;
-
        return 0;
 }
 
@@ -1029,7 +1060,7 @@ static int parse_features(struct mirror_set *ms, unsigned argc, char **argv,
  * log_type is "core" or "disk"
  * #log_params is between 1 and 3
  *
- * If present, features must be "handle_errors".
+ * If present, supported features are "handle_errors" and "keep_log".
  */
 static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 {
@@ -1363,6 +1394,7 @@ static void mirror_status(struct dm_target *ti, status_type_t type,
                          unsigned status_flags, char *result, unsigned maxlen)
 {
        unsigned int m, sz = 0;
+       int num_feature_args = 0;
        struct mirror_set *ms = (struct mirror_set *) ti->private;
        struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh);
        char buffer[ms->nr_mirrors + 1];
@@ -1392,8 +1424,17 @@ static void mirror_status(struct dm_target *ti, status_type_t type,
                        DMEMIT(" %s %llu", ms->mirror[m].dev->name,
                               (unsigned long long)ms->mirror[m].offset);
 
-               if (ms->features & DM_RAID1_HANDLE_ERRORS)
-                       DMEMIT(" 1 handle_errors");
+               num_feature_args += !!errors_handled(ms);
+               num_feature_args += !!keep_log(ms);
+               if (num_feature_args) {
+                       DMEMIT(" %d", num_feature_args);
+                       if (errors_handled(ms))
+                               DMEMIT(" handle_errors");
+                       if (keep_log(ms))
+                               DMEMIT(" keep_log");
+               }
+
+               break;
        }
 }
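
The status path above now emits an optional feature-argument block: a count followed by whichever of handle_errors and keep_log are set, and nothing at all when neither is. A small standalone sketch of how that suffix is assembled; emit_features() is an illustrative stand-in, not the kernel code.

/* Userspace sketch of the feature-args suffix built by mirror_status() above. */
#include <stdio.h>
#include <stdbool.h>

static void emit_features(char *buf, size_t len, bool handle_errors, bool keep_log)
{
	int n = !!handle_errors + !!keep_log;
	size_t sz = 0;

	buf[0] = '\0';
	if (n) {
		sz += snprintf(buf + sz, len - sz, " %d", n);
		if (handle_errors)
			sz += snprintf(buf + sz, len - sz, " handle_errors");
		if (keep_log)
			sz += snprintf(buf + sz, len - sz, " keep_log");
	}
}

int main(void)
{
	char buf[64];

	emit_features(buf, sizeof(buf), true, true);
	printf("'%s'\n", buf);		/* ' 2 handle_errors keep_log' */
	emit_features(buf, sizeof(buf), false, false);
	printf("'%s'\n", buf);		/* '' (no feature arguments) */
	return 0;
}
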
 
@@ -1413,7 +1454,7 @@ static int mirror_iterate_devices(struct dm_target *ti,
 
 static struct target_type mirror_target = {
        .name    = "mirror",
-       .version = {1, 13, 2},
+       .version = {1, 14, 0},
        .module  = THIS_MODULE,
        .ctr     = mirror_ctr,
        .dtr     = mirror_dtr,
index f478a4c..8a8b48f 100644 (file)
@@ -29,30 +29,37 @@ struct dm_stat_percpu {
        unsigned long long io_ticks[2];
        unsigned long long io_ticks_total;
        unsigned long long time_in_queue;
+       unsigned long long *histogram;
 };
 
 struct dm_stat_shared {
        atomic_t in_flight[2];
-       unsigned long stamp;
+       unsigned long long stamp;
        struct dm_stat_percpu tmp;
 };
 
 struct dm_stat {
        struct list_head list_entry;
        int id;
+       unsigned stat_flags;
        size_t n_entries;
        sector_t start;
        sector_t end;
        sector_t step;
+       unsigned n_histogram_entries;
+       unsigned long long *histogram_boundaries;
        const char *program_id;
        const char *aux_data;
        struct rcu_head rcu_head;
        size_t shared_alloc_size;
        size_t percpu_alloc_size;
+       size_t histogram_alloc_size;
        struct dm_stat_percpu *stat_percpu[NR_CPUS];
        struct dm_stat_shared stat_shared[0];
 };
 
+#define STAT_PRECISE_TIMESTAMPS                1
+
 struct dm_stats_last_position {
        sector_t last_sector;
        unsigned last_rw;
@@ -160,10 +167,7 @@ static void dm_kvfree(void *ptr, size_t alloc_size)
 
        free_shared_memory(alloc_size);
 
-       if (is_vmalloc_addr(ptr))
-               vfree(ptr);
-       else
-               kfree(ptr);
+       kvfree(ptr);
 }
 
 static void dm_stat_free(struct rcu_head *head)
@@ -173,8 +177,11 @@ static void dm_stat_free(struct rcu_head *head)
 
        kfree(s->program_id);
        kfree(s->aux_data);
-       for_each_possible_cpu(cpu)
+       for_each_possible_cpu(cpu) {
+               dm_kvfree(s->stat_percpu[cpu][0].histogram, s->histogram_alloc_size);
                dm_kvfree(s->stat_percpu[cpu], s->percpu_alloc_size);
+       }
+       dm_kvfree(s->stat_shared[0].tmp.histogram, s->histogram_alloc_size);
        dm_kvfree(s, s->shared_alloc_size);
 }
 
@@ -227,7 +234,10 @@ void dm_stats_cleanup(struct dm_stats *stats)
 }
 
 static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
-                          sector_t step, const char *program_id, const char *aux_data,
+                          sector_t step, unsigned stat_flags,
+                          unsigned n_histogram_entries,
+                          unsigned long long *histogram_boundaries,
+                          const char *program_id, const char *aux_data,
                           void (*suspend_callback)(struct mapped_device *),
                           void (*resume_callback)(struct mapped_device *),
                           struct mapped_device *md)
@@ -238,6 +248,7 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
        size_t ni;
        size_t shared_alloc_size;
        size_t percpu_alloc_size;
+       size_t histogram_alloc_size;
        struct dm_stat_percpu *p;
        int cpu;
        int ret_id;
@@ -261,19 +272,34 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
        if (percpu_alloc_size / sizeof(struct dm_stat_percpu) != n_entries)
                return -EOVERFLOW;
 
-       if (!check_shared_memory(shared_alloc_size + num_possible_cpus() * percpu_alloc_size))
+       histogram_alloc_size = (n_histogram_entries + 1) * (size_t)n_entries * sizeof(unsigned long long);
+       if (histogram_alloc_size / (n_histogram_entries + 1) != (size_t)n_entries * sizeof(unsigned long long))
+               return -EOVERFLOW;
+
+       if (!check_shared_memory(shared_alloc_size + histogram_alloc_size +
+                                num_possible_cpus() * (percpu_alloc_size + histogram_alloc_size)))
                return -ENOMEM;
 
        s = dm_kvzalloc(shared_alloc_size, NUMA_NO_NODE);
        if (!s)
                return -ENOMEM;
 
+       s->stat_flags = stat_flags;
        s->n_entries = n_entries;
        s->start = start;
        s->end = end;
        s->step = step;
        s->shared_alloc_size = shared_alloc_size;
        s->percpu_alloc_size = percpu_alloc_size;
+       s->histogram_alloc_size = histogram_alloc_size;
+
+       s->n_histogram_entries = n_histogram_entries;
+       s->histogram_boundaries = kmemdup(histogram_boundaries,
+                                         s->n_histogram_entries * sizeof(unsigned long long), GFP_KERNEL);
+       if (!s->histogram_boundaries) {
+               r = -ENOMEM;
+               goto out;
+       }
 
        s->program_id = kstrdup(program_id, GFP_KERNEL);
        if (!s->program_id) {
@@ -291,6 +317,19 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
                atomic_set(&s->stat_shared[ni].in_flight[WRITE], 0);
        }
 
+       if (s->n_histogram_entries) {
+               unsigned long long *hi;
+               hi = dm_kvzalloc(s->histogram_alloc_size, NUMA_NO_NODE);
+               if (!hi) {
+                       r = -ENOMEM;
+                       goto out;
+               }
+               for (ni = 0; ni < n_entries; ni++) {
+                       s->stat_shared[ni].tmp.histogram = hi;
+                       hi += s->n_histogram_entries + 1;
+               }
+       }
+
        for_each_possible_cpu(cpu) {
                p = dm_kvzalloc(percpu_alloc_size, cpu_to_node(cpu));
                if (!p) {
@@ -298,6 +337,18 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
                        goto out;
                }
                s->stat_percpu[cpu] = p;
+               if (s->n_histogram_entries) {
+                       unsigned long long *hi;
+                       hi = dm_kvzalloc(s->histogram_alloc_size, cpu_to_node(cpu));
+                       if (!hi) {
+                               r = -ENOMEM;
+                               goto out;
+                       }
+                       for (ni = 0; ni < n_entries; ni++) {
+                               p[ni].histogram = hi;
+                               hi += s->n_histogram_entries + 1;
+                       }
+               }
        }
 
        /*
@@ -375,9 +426,11 @@ static int dm_stats_delete(struct dm_stats *stats, int id)
         * vfree can't be called from RCU callback
         */
        for_each_possible_cpu(cpu)
-               if (is_vmalloc_addr(s->stat_percpu))
+               if (is_vmalloc_addr(s->stat_percpu) ||
+                   is_vmalloc_addr(s->stat_percpu[cpu][0].histogram))
                        goto do_sync_free;
-       if (is_vmalloc_addr(s)) {
+       if (is_vmalloc_addr(s) ||
+           is_vmalloc_addr(s->stat_shared[0].tmp.histogram)) {
 do_sync_free:
                synchronize_rcu_expedited();
                dm_stat_free(&s->rcu_head);
@@ -417,18 +470,24 @@ static int dm_stats_list(struct dm_stats *stats, const char *program,
        return 1;
 }
 
-static void dm_stat_round(struct dm_stat_shared *shared, struct dm_stat_percpu *p)
+static void dm_stat_round(struct dm_stat *s, struct dm_stat_shared *shared,
+                         struct dm_stat_percpu *p)
 {
        /*
         * This is racy, but so is part_round_stats_single.
         */
-       unsigned long now = jiffies;
-       unsigned in_flight_read;
-       unsigned in_flight_write;
-       unsigned long difference = now - shared->stamp;
+       unsigned long long now, difference;
+       unsigned in_flight_read, in_flight_write;
+
+       if (likely(!(s->stat_flags & STAT_PRECISE_TIMESTAMPS)))
+               now = jiffies;
+       else
+               now = ktime_to_ns(ktime_get());
 
+       difference = now - shared->stamp;
        if (!difference)
                return;
+
        in_flight_read = (unsigned)atomic_read(&shared->in_flight[READ]);
        in_flight_write = (unsigned)atomic_read(&shared->in_flight[WRITE]);
        if (in_flight_read)
@@ -443,8 +502,9 @@ static void dm_stat_round(struct dm_stat_shared *shared, struct dm_stat_percpu *
 }
 
 static void dm_stat_for_entry(struct dm_stat *s, size_t entry,
-                             unsigned long bi_rw, sector_t len, bool merged,
-                             bool end, unsigned long duration)
+                             unsigned long bi_rw, sector_t len,
+                             struct dm_stats_aux *stats_aux, bool end,
+                             unsigned long duration_jiffies)
 {
        unsigned long idx = bi_rw & REQ_WRITE;
        struct dm_stat_shared *shared = &s->stat_shared[entry];
@@ -474,15 +534,35 @@ static void dm_stat_for_entry(struct dm_stat *s, size_t entry,
        p = &s->stat_percpu[smp_processor_id()][entry];
 
        if (!end) {
-               dm_stat_round(shared, p);
+               dm_stat_round(s, shared, p);
                atomic_inc(&shared->in_flight[idx]);
        } else {
-               dm_stat_round(shared, p);
+               unsigned long long duration;
+               dm_stat_round(s, shared, p);
                atomic_dec(&shared->in_flight[idx]);
                p->sectors[idx] += len;
                p->ios[idx] += 1;
-               p->merges[idx] += merged;
-               p->ticks[idx] += duration;
+               p->merges[idx] += stats_aux->merged;
+               if (!(s->stat_flags & STAT_PRECISE_TIMESTAMPS)) {
+                       p->ticks[idx] += duration_jiffies;
+                       duration = jiffies_to_msecs(duration_jiffies);
+               } else {
+                       p->ticks[idx] += stats_aux->duration_ns;
+                       duration = stats_aux->duration_ns;
+               }
+               if (s->n_histogram_entries) {
+                       unsigned lo = 0, hi = s->n_histogram_entries + 1;
+                       while (lo + 1 < hi) {
+                               unsigned mid = (lo + hi) / 2;
+                               if (s->histogram_boundaries[mid - 1] > duration) {
+                                       hi = mid;
+                               } else {
+                                       lo = mid;
+                               }
+
+                       }
+                       p->histogram[lo]++;
+               }
        }
 
 #if BITS_PER_LONG == 32
@@ -494,7 +574,7 @@ static void dm_stat_for_entry(struct dm_stat *s, size_t entry,
 
 static void __dm_stat_bio(struct dm_stat *s, unsigned long bi_rw,
                          sector_t bi_sector, sector_t end_sector,
-                         bool end, unsigned long duration,
+                         bool end, unsigned long duration_jiffies,
                          struct dm_stats_aux *stats_aux)
 {
        sector_t rel_sector, offset, todo, fragment_len;
@@ -523,7 +603,7 @@ static void __dm_stat_bio(struct dm_stat *s, unsigned long bi_rw,
                if (fragment_len > s->step - offset)
                        fragment_len = s->step - offset;
                dm_stat_for_entry(s, entry, bi_rw, fragment_len,
-                                 stats_aux->merged, end, duration);
+                                 stats_aux, end, duration_jiffies);
                todo -= fragment_len;
                entry++;
                offset = 0;
@@ -532,11 +612,13 @@ static void __dm_stat_bio(struct dm_stat *s, unsigned long bi_rw,
 
 void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw,
                         sector_t bi_sector, unsigned bi_sectors, bool end,
-                        unsigned long duration, struct dm_stats_aux *stats_aux)
+                        unsigned long duration_jiffies,
+                        struct dm_stats_aux *stats_aux)
 {
        struct dm_stat *s;
        sector_t end_sector;
        struct dm_stats_last_position *last;
+       bool got_precise_time;
 
        if (unlikely(!bi_sectors))
                return;
@@ -560,8 +642,17 @@ void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw,
 
        rcu_read_lock();
 
-       list_for_each_entry_rcu(s, &stats->list, list_entry)
-               __dm_stat_bio(s, bi_rw, bi_sector, end_sector, end, duration, stats_aux);
+       got_precise_time = false;
+       list_for_each_entry_rcu(s, &stats->list, list_entry) {
+               if (s->stat_flags & STAT_PRECISE_TIMESTAMPS && !got_precise_time) {
+                       if (!end)
+                               stats_aux->duration_ns = ktime_to_ns(ktime_get());
+                       else
+                               stats_aux->duration_ns = ktime_to_ns(ktime_get()) - stats_aux->duration_ns;
+                       got_precise_time = true;
+               }
+               __dm_stat_bio(s, bi_rw, bi_sector, end_sector, end, duration_jiffies, stats_aux);
+       }
 
        rcu_read_unlock();
 }
@@ -574,10 +665,25 @@ static void __dm_stat_init_temporary_percpu_totals(struct dm_stat_shared *shared
 
        local_irq_disable();
        p = &s->stat_percpu[smp_processor_id()][x];
-       dm_stat_round(shared, p);
+       dm_stat_round(s, shared, p);
        local_irq_enable();
 
-       memset(&shared->tmp, 0, sizeof(shared->tmp));
+       shared->tmp.sectors[READ] = 0;
+       shared->tmp.sectors[WRITE] = 0;
+       shared->tmp.ios[READ] = 0;
+       shared->tmp.ios[WRITE] = 0;
+       shared->tmp.merges[READ] = 0;
+       shared->tmp.merges[WRITE] = 0;
+       shared->tmp.ticks[READ] = 0;
+       shared->tmp.ticks[WRITE] = 0;
+       shared->tmp.io_ticks[READ] = 0;
+       shared->tmp.io_ticks[WRITE] = 0;
+       shared->tmp.io_ticks_total = 0;
+       shared->tmp.time_in_queue = 0;
+
+       if (s->n_histogram_entries)
+               memset(shared->tmp.histogram, 0, (s->n_histogram_entries + 1) * sizeof(unsigned long long));
+
        for_each_possible_cpu(cpu) {
                p = &s->stat_percpu[cpu][x];
                shared->tmp.sectors[READ] += ACCESS_ONCE(p->sectors[READ]);
@@ -592,6 +698,11 @@ static void __dm_stat_init_temporary_percpu_totals(struct dm_stat_shared *shared
                shared->tmp.io_ticks[WRITE] += ACCESS_ONCE(p->io_ticks[WRITE]);
                shared->tmp.io_ticks_total += ACCESS_ONCE(p->io_ticks_total);
                shared->tmp.time_in_queue += ACCESS_ONCE(p->time_in_queue);
+               if (s->n_histogram_entries) {
+                       unsigned i;
+                       for (i = 0; i < s->n_histogram_entries + 1; i++)
+                               shared->tmp.histogram[i] += ACCESS_ONCE(p->histogram[i]);
+               }
        }
 }
 
@@ -621,6 +732,15 @@ static void __dm_stat_clear(struct dm_stat *s, size_t idx_start, size_t idx_end,
                p->io_ticks_total -= shared->tmp.io_ticks_total;
                p->time_in_queue -= shared->tmp.time_in_queue;
                local_irq_enable();
+               if (s->n_histogram_entries) {
+                       unsigned i;
+                       for (i = 0; i < s->n_histogram_entries + 1; i++) {
+                               local_irq_disable();
+                               p = &s->stat_percpu[smp_processor_id()][x];
+                               p->histogram[i] -= shared->tmp.histogram[i];
+                               local_irq_enable();
+                       }
+               }
        }
 }
 
@@ -646,11 +766,15 @@ static int dm_stats_clear(struct dm_stats *stats, int id)
 /*
  * This is like jiffies_to_msecs(), but works for 64-bit values.
  */
-static unsigned long long dm_jiffies_to_msec64(unsigned long long j)
+static unsigned long long dm_jiffies_to_msec64(struct dm_stat *s, unsigned long long j)
 {
-       unsigned long long result = 0;
+       unsigned long long result;
        unsigned mult;
 
+       if (s->stat_flags & STAT_PRECISE_TIMESTAMPS)
+               return j;
+
+       result = 0;
        if (j)
                result = jiffies_to_msecs(j & 0x3fffff);
        if (j >= 1 << 22) {
@@ -706,22 +830,29 @@ static int dm_stats_print(struct dm_stats *stats, int id,
 
                __dm_stat_init_temporary_percpu_totals(shared, s, x);
 
-               DMEMIT("%llu+%llu %llu %llu %llu %llu %llu %llu %llu %llu %d %llu %llu %llu %llu\n",
+               DMEMIT("%llu+%llu %llu %llu %llu %llu %llu %llu %llu %llu %d %llu %llu %llu %llu",
                       (unsigned long long)start,
                       (unsigned long long)step,
                       shared->tmp.ios[READ],
                       shared->tmp.merges[READ],
                       shared->tmp.sectors[READ],
-                      dm_jiffies_to_msec64(shared->tmp.ticks[READ]),
+                      dm_jiffies_to_msec64(s, shared->tmp.ticks[READ]),
                       shared->tmp.ios[WRITE],
                       shared->tmp.merges[WRITE],
                       shared->tmp.sectors[WRITE],
-                      dm_jiffies_to_msec64(shared->tmp.ticks[WRITE]),
+                      dm_jiffies_to_msec64(s, shared->tmp.ticks[WRITE]),
                       dm_stat_in_flight(shared),
-                      dm_jiffies_to_msec64(shared->tmp.io_ticks_total),
-                      dm_jiffies_to_msec64(shared->tmp.time_in_queue),
-                      dm_jiffies_to_msec64(shared->tmp.io_ticks[READ]),
-                      dm_jiffies_to_msec64(shared->tmp.io_ticks[WRITE]));
+                      dm_jiffies_to_msec64(s, shared->tmp.io_ticks_total),
+                      dm_jiffies_to_msec64(s, shared->tmp.time_in_queue),
+                      dm_jiffies_to_msec64(s, shared->tmp.io_ticks[READ]),
+                      dm_jiffies_to_msec64(s, shared->tmp.io_ticks[WRITE]));
+               if (s->n_histogram_entries) {
+                       unsigned i;
+                       for (i = 0; i < s->n_histogram_entries + 1; i++) {
+                               DMEMIT("%s%llu", !i ? " " : ":", shared->tmp.histogram[i]);
+                       }
+               }
+               DMEMIT("\n");
 
                if (unlikely(sz + 1 >= maxlen))
                        goto buffer_overflow;
@@ -763,55 +894,134 @@ static int dm_stats_set_aux(struct dm_stats *stats, int id, const char *aux_data
        return 0;
 }
 
+static int parse_histogram(const char *h, unsigned *n_histogram_entries,
+                          unsigned long long **histogram_boundaries)
+{
+       const char *q;
+       unsigned n;
+       unsigned long long last;
+
+       *n_histogram_entries = 1;
+       for (q = h; *q; q++)
+               if (*q == ',')
+                       (*n_histogram_entries)++;
+
+       *histogram_boundaries = kmalloc(*n_histogram_entries * sizeof(unsigned long long), GFP_KERNEL);
+       if (!*histogram_boundaries)
+               return -ENOMEM;
+
+       n = 0;
+       last = 0;
+       while (1) {
+               unsigned long long hi;
+               int s;
+               char ch;
+               s = sscanf(h, "%llu%c", &hi, &ch);
+               if (!s || (s == 2 && ch != ','))
+                       return -EINVAL;
+               if (hi <= last)
+                       return -EINVAL;
+               last = hi;
+               (*histogram_boundaries)[n] = hi;
+               if (s == 1)
+                       return 0;
+               h = strchr(h, ',') + 1;
+               n++;
+       }
+}
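
Taken together with the bucket search added to dm_stat_for_entry() above, the histogram works like this: N strictly increasing boundaries define N+1 buckets, and each completed IO's duration (milliseconds normally, nanoseconds with precise_timestamps) is binned with a binary search over half-open intervals. Below is a standalone C sketch that models both halves; parse_boundaries() and pick_bucket() are illustrative stand-ins that mirror the logic, not the kernel functions themselves.

/*
 * Userspace model of the histogram handling above: parse an increasing,
 * comma-separated boundary list and pick the bucket a duration falls in,
 * using the same half-open intervals as dm_stat_for_entry().
 */
#include <stdio.h>
#include <stdlib.h>

/* N boundaries define N + 1 buckets: [0,b0), [b0,b1), ..., [bN-1,inf). */
static int parse_boundaries(const char *h, unsigned long long **out, unsigned *n)
{
	const char *q;
	unsigned i;
	unsigned long long last = 0;

	*n = 1;
	for (q = h; *q; q++)
		if (*q == ',')
			(*n)++;

	*out = malloc(*n * sizeof(**out));
	if (!*out)
		return -1;

	for (q = h, i = 0; i < *n; i++) {
		char *end;
		unsigned long long v = strtoull(q, &end, 10);

		if (end == q || v <= last)	/* must be strictly increasing */
			return -1;
		(*out)[i] = last = v;
		q = (*end == ',') ? end + 1 : end;
	}
	return 0;
}

/* Same search as the kernel code: find lo with b[lo-1] <= duration < b[lo]. */
static unsigned pick_bucket(const unsigned long long *b, unsigned n,
			    unsigned long long duration)
{
	unsigned lo = 0, hi = n + 1;

	while (lo + 1 < hi) {
		unsigned mid = (lo + hi) / 2;

		if (b[mid - 1] > duration)
			hi = mid;
		else
			lo = mid;
	}
	return lo;
}

int main(void)
{
	unsigned long long *b;
	unsigned n;

	if (parse_boundaries("10,100,1000", &b, &n))
		return 1;
	printf("%u %u %u %u\n",
	       pick_bucket(b, n, 5),     /* 0: below 10   */
	       pick_bucket(b, n, 10),    /* 1: [10,100)   */
	       pick_bucket(b, n, 999),   /* 2: [100,1000) */
	       pick_bucket(b, n, 5000)); /* 3: >= 1000    */
	free(b);
	return 0;
}
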
+
 static int message_stats_create(struct mapped_device *md,
                                unsigned argc, char **argv,
                                char *result, unsigned maxlen)
 {
+       int r;
        int id;
        char dummy;
        unsigned long long start, end, len, step;
        unsigned divisor;
        const char *program_id, *aux_data;
+       unsigned stat_flags = 0;
+
+       unsigned n_histogram_entries = 0;
+       unsigned long long *histogram_boundaries = NULL;
+
+       struct dm_arg_set as, as_backup;
+       const char *a;
+       unsigned feature_args;
 
        /*
         * Input format:
-        *   <range> <step> [<program_id> [<aux_data>]]
+        *   <range> <step> [<extra_parameters> <parameters>] [<program_id> [<aux_data>]]
         */
 
-       if (argc < 3 || argc > 5)
-               return -EINVAL;
+       if (argc < 3)
+               goto ret_einval;
 
-       if (!strcmp(argv[1], "-")) {
+       as.argc = argc;
+       as.argv = argv;
+       dm_consume_args(&as, 1);
+
+       a = dm_shift_arg(&as);
+       if (!strcmp(a, "-")) {
                start = 0;
                len = dm_get_size(md);
                if (!len)
                        len = 1;
-       } else if (sscanf(argv[1], "%llu+%llu%c", &start, &len, &dummy) != 2 ||
+       } else if (sscanf(a, "%llu+%llu%c", &start, &len, &dummy) != 2 ||
                   start != (sector_t)start || len != (sector_t)len)
-               return -EINVAL;
+               goto ret_einval;
 
        end = start + len;
        if (start >= end)
-               return -EINVAL;
+               goto ret_einval;
 
-       if (sscanf(argv[2], "/%u%c", &divisor, &dummy) == 1) {
+       a = dm_shift_arg(&as);
+       if (sscanf(a, "/%u%c", &divisor, &dummy) == 1) {
+               if (!divisor)
+                       return -EINVAL;
                step = end - start;
                if (do_div(step, divisor))
                        step++;
                if (!step)
                        step = 1;
-       } else if (sscanf(argv[2], "%llu%c", &step, &dummy) != 1 ||
+       } else if (sscanf(a, "%llu%c", &step, &dummy) != 1 ||
                   step != (sector_t)step || !step)
-               return -EINVAL;
+               goto ret_einval;
+
+       as_backup = as;
+       a = dm_shift_arg(&as);
+       if (a && sscanf(a, "%u%c", &feature_args, &dummy) == 1) {
+               while (feature_args--) {
+                       a = dm_shift_arg(&as);
+                       if (!a)
+                               goto ret_einval;
+                       if (!strcasecmp(a, "precise_timestamps"))
+                               stat_flags |= STAT_PRECISE_TIMESTAMPS;
+                       else if (!strncasecmp(a, "histogram:", 10)) {
+                               if (n_histogram_entries)
+                                       goto ret_einval;
+                               if ((r = parse_histogram(a + 10, &n_histogram_entries, &histogram_boundaries)))
+                                       goto ret;
+                       } else
+                               goto ret_einval;
+               }
+       } else {
+               as = as_backup;
+       }
 
        program_id = "-";
        aux_data = "-";
 
-       if (argc > 3)
-               program_id = argv[3];
+       a = dm_shift_arg(&as);
+       if (a)
+               program_id = a;
 
-       if (argc > 4)
-               aux_data = argv[4];
+       a = dm_shift_arg(&as);
+       if (a)
+               aux_data = a;
+
+       if (as.argc)
+               goto ret_einval;
 
        /*
         * If a buffer overflow happens after we created the region,
@@ -820,17 +1030,29 @@ static int message_stats_create(struct mapped_device *md,
         * leaked).  So we must detect buffer overflow in advance.
         */
        snprintf(result, maxlen, "%d", INT_MAX);
-       if (dm_message_test_buffer_overflow(result, maxlen))
-               return 1;
+       if (dm_message_test_buffer_overflow(result, maxlen)) {
+               r = 1;
+               goto ret;
+       }
 
-       id = dm_stats_create(dm_get_stats(md), start, end, step, program_id, aux_data,
+       id = dm_stats_create(dm_get_stats(md), start, end, step, stat_flags,
+                            n_histogram_entries, histogram_boundaries, program_id, aux_data,
                             dm_internal_suspend_fast, dm_internal_resume_fast, md);
-       if (id < 0)
-               return id;
+       if (id < 0) {
+               r = id;
+               goto ret;
+       }
 
        snprintf(result, maxlen, "%d", id);
 
-       return 1;
+       r = 1;
+       goto ret;
+
+ret_einval:
+       r = -EINVAL;
+ret:
+       kfree(histogram_boundaries);
+       return r;
 }
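
For reference, the parser above would accept a message along the lines of "@stats_create - /100 2 precise_timestamps histogram:5000,10000,50000 my_program -" sent with "dmsetup message <dev> 0 ...". The optional block after <step> is a feature count followed by that many features, and program_id/aux_data still default to "-". The device, program name and boundary values here are hypothetical; the sketch below only shows the resulting argv layout the parser walks.

/* argv[] layout message_stats_create() above would see for one hypothetical
 * "@stats_create" invocation. */
#include <stdio.h>

int main(void)
{
	const char *argv[] = {
		"@stats_create",		/* argv[0], matched by dm_stats_message() */
		"-",				/* whole device */
		"/100",				/* split the range into 100 areas */
		"2",				/* two optional feature arguments ... */
		"precise_timestamps",		/*  ... nanosecond accounting */
		"histogram:5000,10000,50000",	/*  ... latency buckets (ns with precise_timestamps) */
		"my_program",			/* program_id (hypothetical) */
		"-",				/* aux_data */
	};

	for (unsigned i = 0; i < sizeof(argv) / sizeof(argv[0]); i++)
		printf("argv[%u] = %s\n", i, argv[i]);
	return 0;
}
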
 
 static int message_stats_delete(struct mapped_device *md,
@@ -933,11 +1155,6 @@ int dm_stats_message(struct mapped_device *md, unsigned argc, char **argv,
 {
        int r;
 
-       if (dm_request_based(md)) {
-               DMWARN("Statistics are only supported for bio-based devices");
-               return -EOPNOTSUPP;
-       }
-
        /* All messages here must start with '@' */
        if (!strcasecmp(argv[0], "@stats_create"))
                r = message_stats_create(md, argc, argv, result, maxlen);
index e7c4984..f1c0956 100644 (file)
@@ -18,6 +18,7 @@ struct dm_stats {
 
 struct dm_stats_aux {
        bool merged;
+       unsigned long long duration_ns;
 };
 
 void dm_stats_init(struct dm_stats *st);
@@ -30,7 +31,8 @@ int dm_stats_message(struct mapped_device *md, unsigned argc, char **argv,
 
 void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw,
                         sector_t bi_sector, unsigned bi_sectors, bool end,
-                        unsigned long duration, struct dm_stats_aux *aux);
+                        unsigned long duration_jiffies,
+                        struct dm_stats_aux *aux);
 
 static inline bool dm_stats_used(struct dm_stats *st)
 {
index f8b37d4..a672a15 100644 (file)
@@ -451,10 +451,8 @@ int __init dm_stripe_init(void)
        int r;
 
        r = dm_register_target(&stripe_target);
-       if (r < 0) {
+       if (r < 0)
                DMWARN("target registration failed");
-               return r;
-       }
 
        return r;
 }
index a5f9412..85e1d39 100644 (file)
@@ -964,8 +964,8 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *
                return -EINVAL;
        }
 
-       if (!t->mempools)
-               return -ENOMEM;
+       if (IS_ERR(t->mempools))
+               return PTR_ERR(t->mempools);
 
        return 0;
 }
index 79f6941..48dfe3c 100644 (file)
@@ -184,7 +184,6 @@ struct dm_pool_metadata {
        uint64_t trans_id;
        unsigned long flags;
        sector_t data_block_size;
-       bool read_only:1;
 
        /*
         * Set if a transaction has to be aborted but the attempt to roll back
@@ -836,7 +835,6 @@ struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
        init_rwsem(&pmd->root_lock);
        pmd->time = 0;
        INIT_LIST_HEAD(&pmd->thin_devices);
-       pmd->read_only = false;
        pmd->fail_io = false;
        pmd->bdev = bdev;
        pmd->data_block_size = data_block_size;
@@ -880,7 +878,7 @@ int dm_pool_metadata_close(struct dm_pool_metadata *pmd)
                return -EBUSY;
        }
 
-       if (!pmd->read_only && !pmd->fail_io) {
+       if (!dm_bm_is_read_only(pmd->bm) && !pmd->fail_io) {
                r = __commit_transaction(pmd);
                if (r < 0)
                        DMWARN("%s: __commit_transaction() failed, error = %d",
@@ -1392,10 +1390,11 @@ int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block,
        dm_block_t keys[2] = { td->id, block };
        struct dm_btree_info *info;
 
-       if (pmd->fail_io)
-               return -EINVAL;
-
        down_read(&pmd->root_lock);
+       if (pmd->fail_io) {
+               up_read(&pmd->root_lock);
+               return -EINVAL;
+       }
 
        if (can_issue_io) {
                info = &pmd->info;
@@ -1419,6 +1418,63 @@ int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block,
        return r;
 }
 
+/* FIXME: write a more efficient one in btree */
+int dm_thin_find_mapped_range(struct dm_thin_device *td,
+                             dm_block_t begin, dm_block_t end,
+                             dm_block_t *thin_begin, dm_block_t *thin_end,
+                             dm_block_t *pool_begin, bool *maybe_shared)
+{
+       int r;
+       dm_block_t pool_end;
+       struct dm_thin_lookup_result lookup;
+
+       if (end < begin)
+               return -ENODATA;
+
+       /*
+        * Find first mapped block.
+        */
+       while (begin < end) {
+               r = dm_thin_find_block(td, begin, true, &lookup);
+               if (r) {
+                       if (r != -ENODATA)
+                               return r;
+               } else
+                       break;
+
+               begin++;
+       }
+
+       if (begin == end)
+               return -ENODATA;
+
+       *thin_begin = begin;
+       *pool_begin = lookup.block;
+       *maybe_shared = lookup.shared;
+
+       begin++;
+       pool_end = *pool_begin + 1;
+       while (begin != end) {
+               r = dm_thin_find_block(td, begin, true, &lookup);
+               if (r) {
+                       if (r == -ENODATA)
+                               break;
+                       else
+                               return r;
+               }
+
+               if ((lookup.block != pool_end) ||
+                   (lookup.shared != *maybe_shared))
+                       break;
+
+               pool_end++;
+               begin++;
+       }
+
+       *thin_end = begin;
+       return 0;
+}
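
dm_thin_find_mapped_range() above scans forward to the first mapped virtual block, then extends the run while the pool blocks stay physically contiguous and the shared flag does not change. A userspace model of that scan follows, with a small in-memory array standing in for the btree lookups; the mapping table and helper names are invented for illustration.

/*
 * Userspace model of dm_thin_find_mapped_range(): the real code does the
 * per-block lookups through the btree; a small array stands in for it here.
 */
#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

struct mapping { bool mapped; uint64_t pool_block; bool shared; };

/* Fake thin device: virtual blocks 0..7 */
static const struct mapping dev[8] = {
	{false}, {false},
	{true, 100, false}, {true, 101, false}, {true, 102, false},
	{true, 200, false},			/* not contiguous: new run */
	{true, 201, true},			/* shared flag changes: new run */
	{false},
};

static int find_mapped_range(uint64_t begin, uint64_t end,
			     uint64_t *tb, uint64_t *te,
			     uint64_t *pb, bool *maybe_shared)
{
	uint64_t pool_end;

	/* find the first mapped block */
	while (begin < end && !dev[begin].mapped)
		begin++;
	if (begin == end)
		return -1;			/* -ENODATA in the kernel */

	*tb = begin;
	*pb = dev[begin].pool_block;
	*maybe_shared = dev[begin].shared;

	/* extend while physically contiguous and equally shared */
	pool_end = *pb + 1;
	for (begin++; begin != end; begin++, pool_end++) {
		if (!dev[begin].mapped ||
		    dev[begin].pool_block != pool_end ||
		    dev[begin].shared != *maybe_shared)
			break;
	}
	*te = begin;
	return 0;
}

int main(void)
{
	uint64_t b = 0, tb, te, pb;
	bool shared;

	while (!find_mapped_range(b, 8, &tb, &te, &pb, &shared)) {
		printf("virt [%llu,%llu) -> pool %llu%s\n",
		       (unsigned long long)tb, (unsigned long long)te,
		       (unsigned long long)pb, shared ? " (maybe shared)" : "");
		b = te;
	}
	return 0;
}
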
+
 static int __insert(struct dm_thin_device *td, dm_block_t block,
                    dm_block_t data_block)
 {
@@ -1471,6 +1527,47 @@ static int __remove(struct dm_thin_device *td, dm_block_t block)
        return 0;
 }
 
+static int __remove_range(struct dm_thin_device *td, dm_block_t begin, dm_block_t end)
+{
+       int r;
+       unsigned count;
+       struct dm_pool_metadata *pmd = td->pmd;
+       dm_block_t keys[1] = { td->id };
+       __le64 value;
+       dm_block_t mapping_root;
+
+       /*
+        * Find the mapping tree
+        */
+       r = dm_btree_lookup(&pmd->tl_info, pmd->root, keys, &value);
+       if (r)
+               return r;
+
+       /*
+        * Remove from the mapping tree, taking care to inc the
+        * ref count so it doesn't get deleted.
+        */
+       mapping_root = le64_to_cpu(value);
+       dm_tm_inc(pmd->tm, mapping_root);
+       r = dm_btree_remove(&pmd->tl_info, pmd->root, keys, &pmd->root);
+       if (r)
+               return r;
+
+       r = dm_btree_remove_leaves(&pmd->bl_info, mapping_root, &begin, end, &mapping_root, &count);
+       if (r)
+               return r;
+
+       td->mapped_blocks -= count;
+       td->changed = 1;
+
+       /*
+        * Reinsert the mapping tree.
+        */
+       value = cpu_to_le64(mapping_root);
+       __dm_bless_for_disk(&value);
+       return dm_btree_insert(&pmd->tl_info, pmd->root, keys, &value, &pmd->root);
+}
+
 int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block)
 {
        int r = -EINVAL;
@@ -1483,6 +1580,19 @@ int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block)
        return r;
 }
 
+int dm_thin_remove_range(struct dm_thin_device *td,
+                        dm_block_t begin, dm_block_t end)
+{
+       int r = -EINVAL;
+
+       down_write(&td->pmd->root_lock);
+       if (!td->pmd->fail_io)
+               r = __remove_range(td, begin, end);
+       up_write(&td->pmd->root_lock);
+
+       return r;
+}
+
 int dm_pool_block_is_used(struct dm_pool_metadata *pmd, dm_block_t b, bool *result)
 {
        int r;
@@ -1739,7 +1849,6 @@ int dm_pool_resize_metadata_dev(struct dm_pool_metadata *pmd, dm_block_t new_cou
 void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd)
 {
        down_write(&pmd->root_lock);
-       pmd->read_only = true;
        dm_bm_set_read_only(pmd->bm);
        up_write(&pmd->root_lock);
 }
@@ -1747,7 +1856,6 @@ void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd)
 void dm_pool_metadata_read_write(struct dm_pool_metadata *pmd)
 {
        down_write(&pmd->root_lock);
-       pmd->read_only = false;
        dm_bm_set_read_write(pmd->bm);
        up_write(&pmd->root_lock);
 }
index fac01a9..a938bab 100644 (file)
@@ -147,6 +147,15 @@ int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block,
                       int can_issue_io, struct dm_thin_lookup_result *result);
 
 /*
+ * Retrieve the next run of contiguously mapped blocks.  Useful for working
+ * out where to break up IO.  Returns 0 on success, < 0 on error.
+ */
+int dm_thin_find_mapped_range(struct dm_thin_device *td,
+                             dm_block_t begin, dm_block_t end,
+                             dm_block_t *thin_begin, dm_block_t *thin_end,
+                             dm_block_t *pool_begin, bool *maybe_shared);
+
+/*
  * Obtain an unused block.
  */
 int dm_pool_alloc_data_block(struct dm_pool_metadata *pmd, dm_block_t *result);
@@ -158,6 +167,8 @@ int dm_thin_insert_block(struct dm_thin_device *td, dm_block_t block,
                         dm_block_t data_block);
 
 int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block);
+int dm_thin_remove_range(struct dm_thin_device *td,
+                        dm_block_t begin, dm_block_t end);
 
 /*
  * Queries.
index e852602..c33f61a 100644 (file)
@@ -111,22 +111,30 @@ DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle,
 /*
  * Key building.
  */
-static void build_data_key(struct dm_thin_device *td,
-                          dm_block_t b, struct dm_cell_key *key)
+enum lock_space {
+       VIRTUAL,
+       PHYSICAL
+};
+
+static void build_key(struct dm_thin_device *td, enum lock_space ls,
+                     dm_block_t b, dm_block_t e, struct dm_cell_key *key)
 {
-       key->virtual = 0;
+       key->virtual = (ls == VIRTUAL);
        key->dev = dm_thin_dev_id(td);
        key->block_begin = b;
-       key->block_end = b + 1ULL;
+       key->block_end = e;
+}
+
+static void build_data_key(struct dm_thin_device *td, dm_block_t b,
+                          struct dm_cell_key *key)
+{
+       build_key(td, PHYSICAL, b, b + 1llu, key);
 }
 
 static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
                              struct dm_cell_key *key)
 {
-       key->virtual = 1;
-       key->dev = dm_thin_dev_id(td);
-       key->block_begin = b;
-       key->block_end = b + 1ULL;
+       build_key(td, VIRTUAL, b, b + 1llu, key);
 }
 
 /*----------------------------------------------------------------*/
@@ -312,6 +320,138 @@ struct thin_c {
 
 /*----------------------------------------------------------------*/
 
+/**
+ * __blkdev_issue_discard_async - queue a discard with async completion
+ * @bdev:      blockdev to issue discard for
+ * @sector:    start sector
+ * @nr_sects:  number of sectors to discard
+ * @gfp_mask:  memory allocation flags (for bio_alloc)
+ * @flags:     BLKDEV_IFL_* flags to control behaviour
+ * @parent_bio: parent discard bio that all sub discards get chained to
+ *
+ * Description:
+ *    Asynchronously issue a discard request for the sectors in question.
+ *    NOTE: this variant of blk-core's blkdev_issue_discard() is a stop-gap
+ *    that is being kept local to DM thinp until the block-layer changes
+ *    that allow late bio splitting land upstream.
+ */
+static int __blkdev_issue_discard_async(struct block_device *bdev, sector_t sector,
+                                       sector_t nr_sects, gfp_t gfp_mask, unsigned long flags,
+                                       struct bio *parent_bio)
+{
+       struct request_queue *q = bdev_get_queue(bdev);
+       int type = REQ_WRITE | REQ_DISCARD;
+       unsigned int max_discard_sectors, granularity;
+       int alignment;
+       struct bio *bio;
+       int ret = 0;
+       struct blk_plug plug;
+
+       if (!q)
+               return -ENXIO;
+
+       if (!blk_queue_discard(q))
+               return -EOPNOTSUPP;
+
+       /* Zero-sector (unknown) and one-sector granularities are the same.  */
+       granularity = max(q->limits.discard_granularity >> 9, 1U);
+       alignment = (bdev_discard_alignment(bdev) >> 9) % granularity;
+
+       /*
+        * Ensure that max_discard_sectors is of the proper
+        * granularity, so that requests stay aligned after a split.
+        */
+       max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9);
+       max_discard_sectors -= max_discard_sectors % granularity;
+       if (unlikely(!max_discard_sectors)) {
+               /* Avoid infinite loop below. Being cautious never hurts. */
+               return -EOPNOTSUPP;
+       }
+
+       if (flags & BLKDEV_DISCARD_SECURE) {
+               if (!blk_queue_secdiscard(q))
+                       return -EOPNOTSUPP;
+               type |= REQ_SECURE;
+       }
+
+       blk_start_plug(&plug);
+       while (nr_sects) {
+               unsigned int req_sects;
+               sector_t end_sect, tmp;
+
+               /*
+                * Required bio_put occurs in bio_endio thanks to bio_chain below
+                */
+               bio = bio_alloc(gfp_mask, 1);
+               if (!bio) {
+                       ret = -ENOMEM;
+                       break;
+               }
+
+               req_sects = min_t(sector_t, nr_sects, max_discard_sectors);
+
+               /*
+                * If splitting a request, and the next starting sector would be
+                * misaligned, stop the discard at the previous aligned sector.
+                */
+               end_sect = sector + req_sects;
+               tmp = end_sect;
+               if (req_sects < nr_sects &&
+                   sector_div(tmp, granularity) != alignment) {
+                       end_sect = end_sect - alignment;
+                       sector_div(end_sect, granularity);
+                       end_sect = end_sect * granularity + alignment;
+                       req_sects = end_sect - sector;
+               }
+
+               bio_chain(bio, parent_bio);
+
+               bio->bi_iter.bi_sector = sector;
+               bio->bi_bdev = bdev;
+
+               bio->bi_iter.bi_size = req_sects << 9;
+               nr_sects -= req_sects;
+               sector = end_sect;
+
+               submit_bio(type, bio);
+
+               /*
+                * We can loop for a long time in here, if someone does
+                * full device discards (like mkfs). Be nice and allow
+                * us to schedule out to avoid softlocking if preempt
+                * is disabled.
+                */
+               cond_resched();
+       }
+       blk_finish_plug(&plug);
+
+       return ret;
+}
+
+static bool block_size_is_power_of_two(struct pool *pool)
+{
+       return pool->sectors_per_block_shift >= 0;
+}
+
+static sector_t block_to_sectors(struct pool *pool, dm_block_t b)
+{
+       return block_size_is_power_of_two(pool) ?
+               (b << pool->sectors_per_block_shift) :
+               (b * pool->sectors_per_block);
+}
+
+static int issue_discard(struct thin_c *tc, dm_block_t data_b, dm_block_t data_e,
+                        struct bio *parent_bio)
+{
+       sector_t s = block_to_sectors(tc->pool, data_b);
+       sector_t len = block_to_sectors(tc->pool, data_e - data_b);
+
+       return __blkdev_issue_discard_async(tc->pool_dev->bdev, s, len,
+                                           GFP_NOWAIT, 0, parent_bio);
+}
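
issue_discard() above converts a run of data blocks [data_b, data_e) into the start sector and length handed to the discard helper via block_to_sectors(), which shifts when the pool block size is a power of two and multiplies otherwise. A small model of that conversion follows; the geometries are example values, not real pool parameters.

/* Model of the block_to_sectors()/issue_discard() sector math above. */
#include <stdio.h>
#include <stdint.h>

struct pool_geom {
	int sectors_per_block_shift;	/* >= 0 only when the size is a power of two */
	uint64_t sectors_per_block;
};

static uint64_t block_to_sectors(const struct pool_geom *p, uint64_t b)
{
	return p->sectors_per_block_shift >= 0 ?
		b << p->sectors_per_block_shift :
		b * p->sectors_per_block;
}

int main(void)
{
	struct pool_geom pow2 = { .sectors_per_block_shift = 7, .sectors_per_block = 128 };
	struct pool_geom odd  = { .sectors_per_block_shift = -1, .sectors_per_block = 96 };
	uint64_t data_b = 10, data_e = 14;	/* discard blocks [10,14) */

	/* start sector and length handed to the discard helper */
	printf("pow2: sector %llu, len %llu\n",
	       (unsigned long long)block_to_sectors(&pow2, data_b),
	       (unsigned long long)block_to_sectors(&pow2, data_e - data_b));
	printf("odd:  sector %llu, len %llu\n",
	       (unsigned long long)block_to_sectors(&odd, data_b),
	       (unsigned long long)block_to_sectors(&odd, data_e - data_b));
	return 0;
}
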
+
+/*----------------------------------------------------------------*/
+
 /*
  * wake_worker() is used when new work is queued and when pool_resume is
  * ready to continue deferred IO processing.
@@ -461,6 +601,7 @@ struct dm_thin_endio_hook {
        struct dm_deferred_entry *all_io_entry;
        struct dm_thin_new_mapping *overwrite_mapping;
        struct rb_node rb_node;
+       struct dm_bio_prison_cell *cell;
 };
 
 static void __merge_bio_list(struct bio_list *bios, struct bio_list *master)
@@ -541,11 +682,6 @@ static void error_retry_list(struct pool *pool)
  * target.
  */
 
-static bool block_size_is_power_of_two(struct pool *pool)
-{
-       return pool->sectors_per_block_shift >= 0;
-}
-
 static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
 {
        struct pool *pool = tc->pool;
@@ -559,6 +695,34 @@ static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
        return block_nr;
 }
 
+/*
+ * Returns the _complete_ blocks that this bio covers.
+ */
+static void get_bio_block_range(struct thin_c *tc, struct bio *bio,
+                               dm_block_t *begin, dm_block_t *end)
+{
+       struct pool *pool = tc->pool;
+       sector_t b = bio->bi_iter.bi_sector;
+       sector_t e = b + (bio->bi_iter.bi_size >> SECTOR_SHIFT);
+
+       b += pool->sectors_per_block - 1ull; /* so we round up */
+
+       if (block_size_is_power_of_two(pool)) {
+               b >>= pool->sectors_per_block_shift;
+               e >>= pool->sectors_per_block_shift;
+       } else {
+               (void) sector_div(b, pool->sectors_per_block);
+               (void) sector_div(e, pool->sectors_per_block);
+       }
+
+       if (e < b)
+               /* Can happen if the bio is within a single block. */
+               e = b;
+
+       *begin = b;
+       *end = e;
+}
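
get_bio_block_range() above deliberately rounds the start up and the end down, so only blocks completely covered by the bio are returned; a bio that sits entirely inside one block yields an empty range (begin == end). A runnable model with made-up numbers, using only the power-of-two path for brevity:

/* Model of the complete-block rounding in get_bio_block_range() above. */
#include <stdio.h>
#include <stdint.h>

static void bio_block_range(uint64_t bi_sector, uint64_t bi_size_sectors,
			    uint64_t sectors_per_block,
			    uint64_t *begin, uint64_t *end)
{
	uint64_t b = bi_sector;
	uint64_t e = bi_sector + bi_size_sectors;

	b += sectors_per_block - 1;	/* round the start up ... */
	b /= sectors_per_block;
	e /= sectors_per_block;		/* ... and the end down */

	if (e < b)			/* bio sits inside a single block */
		e = b;

	*begin = b;
	*end = e;
}

int main(void)
{
	uint64_t begin, end;

	/* 128-sector blocks: sectors [100,900) fully cover blocks [1,7) */
	bio_block_range(100, 800, 128, &begin, &end);
	printf("[%llu,%llu)\n", (unsigned long long)begin, (unsigned long long)end);

	/* a small bio inside block 0 covers no complete block: begin == end */
	bio_block_range(10, 8, 128, &begin, &end);
	printf("[%llu,%llu)\n", (unsigned long long)begin, (unsigned long long)end);
	return 0;
}
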
+
 static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
 {
        struct pool *pool = tc->pool;
@@ -647,7 +811,7 @@ struct dm_thin_new_mapping {
        struct list_head list;
 
        bool pass_discard:1;
-       bool definitely_not_shared:1;
+       bool maybe_shared:1;
 
        /*
         * Track quiescing, copying and zeroing preparation actions.  When this
@@ -658,9 +822,9 @@ struct dm_thin_new_mapping {
 
        int err;
        struct thin_c *tc;
-       dm_block_t virt_block;
+       dm_block_t virt_begin, virt_end;
        dm_block_t data_block;
-       struct dm_bio_prison_cell *cell, *cell2;
+       struct dm_bio_prison_cell *cell;
 
        /*
         * If the bio covers the whole area of a block then we can avoid
@@ -705,6 +869,8 @@ static void overwrite_endio(struct bio *bio, int err)
        struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
        struct dm_thin_new_mapping *m = h->overwrite_mapping;
 
+       bio->bi_end_io = m->saved_bi_end_io;
+
        m->err = err;
        complete_mapping_preparation(m);
 }
@@ -793,9 +959,6 @@ static void inc_remap_and_issue_cell(struct thin_c *tc,
 
 static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
 {
-       if (m->bio)
-               m->bio->bi_end_io = m->saved_bi_end_io;
-
        cell_error(m->tc->pool, m->cell);
        list_del(&m->list);
        mempool_free(m, m->tc->pool->mapping_pool);
@@ -805,13 +968,9 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
 {
        struct thin_c *tc = m->tc;
        struct pool *pool = tc->pool;
-       struct bio *bio;
+       struct bio *bio = m->bio;
        int r;
 
-       bio = m->bio;
-       if (bio)
-               bio->bi_end_io = m->saved_bi_end_io;
-
        if (m->err) {
                cell_error(pool, m->cell);
                goto out;
@@ -822,7 +981,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
         * Any I/O for this block arriving after this point will get
         * remapped to it directly.
         */
-       r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
+       r = dm_thin_insert_block(tc->td, m->virt_begin, m->data_block);
        if (r) {
                metadata_operation_failed(pool, "dm_thin_insert_block", r);
                cell_error(pool, m->cell);
@@ -849,50 +1008,112 @@ out:
        mempool_free(m, pool->mapping_pool);
 }
 
-static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
+/*----------------------------------------------------------------*/
+
+static void free_discard_mapping(struct dm_thin_new_mapping *m)
 {
        struct thin_c *tc = m->tc;
+       if (m->cell)
+               cell_defer_no_holder(tc, m->cell);
+       mempool_free(m, tc->pool->mapping_pool);
+}
 
+static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
+{
        bio_io_error(m->bio);
+       free_discard_mapping(m);
+}
+
+static void process_prepared_discard_success(struct dm_thin_new_mapping *m)
+{
+       bio_endio(m->bio, 0);
+       free_discard_mapping(m);
+}
+
+static void process_prepared_discard_no_passdown(struct dm_thin_new_mapping *m)
+{
+       int r;
+       struct thin_c *tc = m->tc;
+
+       r = dm_thin_remove_range(tc->td, m->cell->key.block_begin, m->cell->key.block_end);
+       if (r) {
+               metadata_operation_failed(tc->pool, "dm_thin_remove_range", r);
+               bio_io_error(m->bio);
+       } else
+               bio_endio(m->bio, 0);
+
        cell_defer_no_holder(tc, m->cell);
-       cell_defer_no_holder(tc, m->cell2);
        mempool_free(m, tc->pool->mapping_pool);
 }
 
-static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m)
+static int passdown_double_checking_shared_status(struct dm_thin_new_mapping *m)
 {
+       /*
+        * We've already unmapped this range of blocks, but before we
+        * passdown we have to check that these blocks are now unused.
+        */
+       int r;
+       bool used = true;
        struct thin_c *tc = m->tc;
+       struct pool *pool = tc->pool;
+       dm_block_t b = m->data_block, e, end = m->data_block + m->virt_end - m->virt_begin;
 
-       inc_all_io_entry(tc->pool, m->bio);
-       cell_defer_no_holder(tc, m->cell);
-       cell_defer_no_holder(tc, m->cell2);
+       while (b != end) {
+               /* find start of unmapped run */
+               for (; b < end; b++) {
+                       r = dm_pool_block_is_used(pool->pmd, b, &used);
+                       if (r)
+                               return r;
 
-       if (m->pass_discard)
-               if (m->definitely_not_shared)
-                       remap_and_issue(tc, m->bio, m->data_block);
-               else {
-                       bool used = false;
-                       if (dm_pool_block_is_used(tc->pool->pmd, m->data_block, &used) || used)
-                               bio_endio(m->bio, 0);
-                       else
-                               remap_and_issue(tc, m->bio, m->data_block);
+                       if (!used)
+                               break;
                }
-       else
-               bio_endio(m->bio, 0);
 
-       mempool_free(m, tc->pool->mapping_pool);
+               if (b == end)
+                       break;
+
+               /* find end of run */
+               for (e = b + 1; e != end; e++) {
+                       r = dm_pool_block_is_used(pool->pmd, e, &used);
+                       if (r)
+                               return r;
+
+                       if (used)
+                               break;
+               }
+
+               r = issue_discard(tc, b, e, m->bio);
+               if (r)
+                       return r;
+
+               b = e;
+       }
+
+       return 0;
 }
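passdown_double_checking_shared_status() above scans the data-block range for maximal runs of blocks that are no longer referenced and issues one discard per run. A minimal userspace sketch of the same run-finding loop, assuming a simple in-memory bitmap; block_is_used() and issue_discard() are placeholders for dm_pool_block_is_used() and the driver's discard path, not the real APIs.

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

typedef uint64_t block_t;

/* stand-in for dm_pool_block_is_used(): here just an in-memory bitmap */
static bool block_is_used(const bool *used, block_t b)
{
	return used[b];
}

/* stand-in for issuing a discard over [b, e) */
static void issue_discard(block_t b, block_t e)
{
	printf("discard blocks [%llu, %llu)\n",
	       (unsigned long long)b, (unsigned long long)e);
}

/* find maximal runs of unused blocks in [begin, end) and discard each run */
static void discard_unused_runs(const bool *used, block_t begin, block_t end)
{
	block_t b = begin, e;

	while (b != end) {
		/* skip blocks that are still referenced elsewhere */
		while (b < end && block_is_used(used, b))
			b++;
		if (b == end)
			break;

		/* extend the run while blocks remain unused */
		for (e = b + 1; e != end && !block_is_used(used, e); e++)
			;

		issue_discard(b, e);
		b = e;
	}
}

int main(void)
{
	/* blocks 2-3 and 6 are still shared/used; the rest may be discarded */
	bool used[8] = { false, false, true, true, false, false, true, false };

	discard_unused_runs(used, 0, 8);
	return 0;
}
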
 
-static void process_prepared_discard(struct dm_thin_new_mapping *m)
+static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m)
 {
        int r;
        struct thin_c *tc = m->tc;
+       struct pool *pool = tc->pool;
 
-       r = dm_thin_remove_block(tc->td, m->virt_block);
+       r = dm_thin_remove_range(tc->td, m->virt_begin, m->virt_end);
        if (r)
-               DMERR_LIMIT("dm_thin_remove_block() failed");
+               metadata_operation_failed(pool, "dm_thin_remove_range", r);
+
+       else if (m->maybe_shared)
+               r = passdown_double_checking_shared_status(m);
+       else
+               r = issue_discard(tc, m->data_block, m->data_block + (m->virt_end - m->virt_begin), m->bio);
 
-       process_prepared_discard_passdown(m);
+       /*
+        * Even if r is set, there could be sub discards in flight that we
+        * need to wait for.
+        */
+       bio_endio(m->bio, r);
+       cell_defer_no_holder(tc, m->cell);
+       mempool_free(m, pool->mapping_pool);
 }
 
 static void process_prepared(struct pool *pool, struct list_head *head,
@@ -976,7 +1197,7 @@ static void ll_zero(struct thin_c *tc, struct dm_thin_new_mapping *m,
 }
 
 static void remap_and_issue_overwrite(struct thin_c *tc, struct bio *bio,
-                                     dm_block_t data_block,
+                                     dm_block_t data_begin,
                                      struct dm_thin_new_mapping *m)
 {
        struct pool *pool = tc->pool;
@@ -986,7 +1207,7 @@ static void remap_and_issue_overwrite(struct thin_c *tc, struct bio *bio,
        m->bio = bio;
        save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
        inc_all_io_entry(pool, bio);
-       remap_and_issue(tc, bio, data_block);
+       remap_and_issue(tc, bio, data_begin);
 }
 
 /*
@@ -1003,7 +1224,8 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
        struct dm_thin_new_mapping *m = get_next_mapping(pool);
 
        m->tc = tc;
-       m->virt_block = virt_block;
+       m->virt_begin = virt_block;
+       m->virt_end = virt_block + 1u;
        m->data_block = data_dest;
        m->cell = cell;
 
@@ -1082,7 +1304,8 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
 
        atomic_set(&m->prepare_actions, 1); /* no need to quiesce */
        m->tc = tc;
-       m->virt_block = virt_block;
+       m->virt_begin = virt_block;
+       m->virt_end = virt_block + 1u;
        m->data_block = data_block;
        m->cell = cell;
 
@@ -1091,16 +1314,14 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
         * zeroing pre-existing data, we can issue the bio immediately.
         * Otherwise we use kcopyd to zero the data first.
         */
-       if (!pool->pf.zero_new_blocks)
+       if (pool->pf.zero_new_blocks) {
+               if (io_overwrites_block(pool, bio))
+                       remap_and_issue_overwrite(tc, bio, data_block, m);
+               else
+                       ll_zero(tc, m, data_block * pool->sectors_per_block,
+                               (data_block + 1) * pool->sectors_per_block);
+       } else
                process_prepared_mapping(m);
-
-       else if (io_overwrites_block(pool, bio))
-               remap_and_issue_overwrite(tc, bio, data_block, m);
-
-       else
-               ll_zero(tc, m,
-                       data_block * pool->sectors_per_block,
-                       (data_block + 1) * pool->sectors_per_block);
 }
 
 static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
@@ -1291,99 +1512,149 @@ static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *c
                retry_on_resume(bio);
 }
 
-static void process_discard_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell)
+static void process_discard_cell_no_passdown(struct thin_c *tc,
+                                            struct dm_bio_prison_cell *virt_cell)
 {
-       int r;
-       struct bio *bio = cell->holder;
        struct pool *pool = tc->pool;
-       struct dm_bio_prison_cell *cell2;
-       struct dm_cell_key key2;
-       dm_block_t block = get_bio_block(tc, bio);
-       struct dm_thin_lookup_result lookup_result;
-       struct dm_thin_new_mapping *m;
+       struct dm_thin_new_mapping *m = get_next_mapping(pool);
 
-       if (tc->requeue_mode) {
-               cell_requeue(pool, cell);
-               return;
-       }
+       /*
+        * We don't need to lock the data blocks, since there's no
+        * passdown.  We only lock data blocks for allocation and breaking sharing.
+        */
+       m->tc = tc;
+       m->virt_begin = virt_cell->key.block_begin;
+       m->virt_end = virt_cell->key.block_end;
+       m->cell = virt_cell;
+       m->bio = virt_cell->holder;
 
-       r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
-       switch (r) {
-       case 0:
-               /*
-                * Check nobody is fiddling with this pool block.  This can
-                * happen if someone's in the process of breaking sharing
-                * on this block.
-                */
-               build_data_key(tc->td, lookup_result.block, &key2);
-               if (bio_detain(tc->pool, &key2, bio, &cell2)) {
-                       cell_defer_no_holder(tc, cell);
-                       break;
-               }
+       if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
+               pool->process_prepared_discard(m);
+}
 
-               if (io_overlaps_block(pool, bio)) {
-                       /*
-                        * IO may still be going to the destination block.  We must
-                        * quiesce before we can do the removal.
-                        */
-                       m = get_next_mapping(pool);
-                       m->tc = tc;
-                       m->pass_discard = pool->pf.discard_passdown;
-                       m->definitely_not_shared = !lookup_result.shared;
-                       m->virt_block = block;
-                       m->data_block = lookup_result.block;
-                       m->cell = cell;
-                       m->cell2 = cell2;
-                       m->bio = bio;
-
-                       if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
-                               pool->process_prepared_discard(m);
+/*
+ * FIXME: DM local hack to defer parent bio's end_io until we
+ * _know_ all chained sub range discard bios have completed.
+ * Will go away once late bio splitting lands upstream!
+ */
+static inline void __bio_inc_remaining(struct bio *bio)
+{
+       bio->bi_flags |= (1 << BIO_CHAIN);
+       smp_mb__before_atomic();
+       atomic_inc(&bio->__bi_remaining);
+}
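__bio_inc_remaining() above bumps the parent bio's completion counter once per chained sub-discard, so the parent's end_io only runs after the last sub-range bio has finished. A stripped-down model of that remaining-counter idea using C11 atomics; parent_model, inc_remaining() and endio() are illustrative, not the block layer's bio accounting.

#include <stdio.h>
#include <stdatomic.h>

struct parent_model {
	atomic_int remaining;	/* outstanding completions */
	int done;
};

/* analogous to __bio_inc_remaining(): one more completion must arrive */
static void inc_remaining(struct parent_model *p)
{
	atomic_fetch_add(&p->remaining, 1);
}

/* analogous to bio_endio(): only the final decrement finishes the parent */
static void endio(struct parent_model *p)
{
	if (atomic_fetch_sub(&p->remaining, 1) == 1) {
		p->done = 1;
		printf("parent completed\n");
	}
}

int main(void)
{
	struct parent_model parent = { .done = 0 };
	int i;

	atomic_init(&parent.remaining, 1);	/* a fresh bio holds one reference */

	/* three sub-range discards get chained to the parent */
	for (i = 0; i < 3; i++)
		inc_remaining(&parent);

	/* the submitter's early bio_endio(bio, 0) does not finish it... */
	endio(&parent);

	/* ...only the last sub-discard completion does */
	for (i = 0; i < 3; i++)
		endio(&parent);

	return parent.done ? 0 : 1;
}
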
 
-               } else {
-                       inc_all_io_entry(pool, bio);
-                       cell_defer_no_holder(tc, cell);
-                       cell_defer_no_holder(tc, cell2);
+static void break_up_discard_bio(struct thin_c *tc, dm_block_t begin, dm_block_t end,
+                                struct bio *bio)
+{
+       struct pool *pool = tc->pool;
 
+       int r;
+       bool maybe_shared;
+       struct dm_cell_key data_key;
+       struct dm_bio_prison_cell *data_cell;
+       struct dm_thin_new_mapping *m;
+       dm_block_t virt_begin, virt_end, data_begin;
+
+       while (begin != end) {
+               r = ensure_next_mapping(pool);
+               if (r)
+                       /* we did our best */
+                       return;
+
+               r = dm_thin_find_mapped_range(tc->td, begin, end, &virt_begin, &virt_end,
+                                             &data_begin, &maybe_shared);
+               if (r)
                        /*
-                        * The DM core makes sure that the discard doesn't span
-                        * a block boundary.  So we submit the discard of a
-                        * partial block appropriately.
+                        * Silently fail, letting any mappings we've
+                        * created complete.
                         */
-                       if ((!lookup_result.shared) && pool->pf.discard_passdown)
-                               remap_and_issue(tc, bio, lookup_result.block);
-                       else
-                               bio_endio(bio, 0);
+                       break;
+
+               build_key(tc->td, PHYSICAL, data_begin, data_begin + (virt_end - virt_begin), &data_key);
+               if (bio_detain(tc->pool, &data_key, NULL, &data_cell)) {
+                       /* contention, we'll give up with this range */
+                       begin = virt_end;
+                       continue;
                }
-               break;
 
-       case -ENODATA:
                /*
-                * It isn't provisioned, just forget it.
+                * IO may still be going to the destination block.  We must
+                * quiesce before we can do the removal.
                 */
-               cell_defer_no_holder(tc, cell);
-               bio_endio(bio, 0);
-               break;
+               m = get_next_mapping(pool);
+               m->tc = tc;
+               m->maybe_shared = maybe_shared;
+               m->virt_begin = virt_begin;
+               m->virt_end = virt_end;
+               m->data_block = data_begin;
+               m->cell = data_cell;
+               m->bio = bio;
 
-       default:
-               DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
-                           __func__, r);
-               cell_defer_no_holder(tc, cell);
-               bio_io_error(bio);
-               break;
+               /*
+                * The parent bio must not complete before sub discard bios are
+                * chained to it (see __blkdev_issue_discard_async's bio_chain)!
+                *
+                * This per-mapping bi_remaining increment is paired with
+                * the implicit decrement that occurs via bio_endio() in
+                * process_prepared_discard_{passdown,no_passdown}.
+                */
+               __bio_inc_remaining(bio);
+               if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
+                       pool->process_prepared_discard(m);
+
+               begin = virt_end;
        }
 }
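break_up_discard_bio() walks the virtual range, asks the metadata for the next mapped sub-range, locks the matching data range and queues one mapping per run. The control flow can be sketched in userspace against a toy, sorted mapping table; find_mapped_range() here is a stand-in for dm_thin_find_mapped_range() and ignores sharing, locking and error handling.

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

typedef uint64_t block_t;

/* toy mapping: virtual blocks [vbegin, vend) -> data blocks starting at data */
struct mapping {
	block_t vbegin, vend, data;
};

/* sorted by vbegin; the gaps are unprovisioned */
static const struct mapping table[] = {
	{  4,  8, 100 },
	{ 10, 12, 200 },
};

/*
 * Toy stand-in for dm_thin_find_mapped_range(): report the first mapped
 * sub-range inside [begin, end), or false if nothing there is mapped.
 */
static bool find_mapped_range(block_t begin, block_t end,
			      block_t *vb, block_t *ve, block_t *db)
{
	size_t i;

	for (i = 0; i < sizeof(table) / sizeof(table[0]); i++) {
		if (table[i].vend <= begin || table[i].vbegin >= end)
			continue;
		*vb = table[i].vbegin > begin ? table[i].vbegin : begin;
		*ve = table[i].vend < end ? table[i].vend : end;
		*db = table[i].data + (*vb - table[i].vbegin);
		return true;
	}
	return false;
}

/* queue one sub-discard per mapped run, like one dm_thin_new_mapping each */
static void break_up_discard(block_t begin, block_t end)
{
	block_t vb, ve, db;

	while (begin != end) {
		if (!find_mapped_range(begin, end, &vb, &ve, &db))
			break;	/* nothing else is provisioned */

		printf("discard virt [%llu, %llu) -> data [%llu, %llu)\n",
		       (unsigned long long)vb, (unsigned long long)ve,
		       (unsigned long long)db,
		       (unsigned long long)(db + (ve - vb)));

		begin = ve;	/* continue after this run */
	}
}

int main(void)
{
	break_up_discard(0, 16);
	return 0;
}
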
 
+static void process_discard_cell_passdown(struct thin_c *tc, struct dm_bio_prison_cell *virt_cell)
+{
+       struct bio *bio = virt_cell->holder;
+       struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
+
+       /*
+        * The virt_cell will only get freed once the origin bio completes.
+        * This means it will remain locked while all the individual
+        * passdown bios are in flight.
+        */
+       h->cell = virt_cell;
+       break_up_discard_bio(tc, virt_cell->key.block_begin, virt_cell->key.block_end, bio);
+
+       /*
+        * We complete the bio now, knowing that the bi_remaining field
+        * will prevent completion until the sub range discards have
+        * completed.
+        */
+       bio_endio(bio, 0);
+}
+
 static void process_discard_bio(struct thin_c *tc, struct bio *bio)
 {
-       struct dm_bio_prison_cell *cell;
-       struct dm_cell_key key;
-       dm_block_t block = get_bio_block(tc, bio);
+       dm_block_t begin, end;
+       struct dm_cell_key virt_key;
+       struct dm_bio_prison_cell *virt_cell;
 
-       build_virtual_key(tc->td, block, &key);
-       if (bio_detain(tc->pool, &key, bio, &cell))
+       get_bio_block_range(tc, bio, &begin, &end);
+       if (begin == end) {
+               /*
+                * The discard covers less than a block.
+                */
+               bio_endio(bio, 0);
+               return;
+       }
+
+       build_key(tc->td, VIRTUAL, begin, end, &virt_key);
+       if (bio_detain(tc->pool, &virt_key, bio, &virt_cell))
+               /*
+                * Potential starvation issue: We're relying on the
+                * fs/application being well behaved, and not trying to
+                * send IO to a region at the same time as discarding it.
+                * If they do this persistently then it's possible this
+                * cell will never be granted.
+                */
                return;
 
-       process_discard_cell(tc, cell);
+       tc->pool->process_discard_cell(tc, virt_cell);
 }
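process_discard_bio() first converts the bio into a whole-block range; when the rounded range is empty (begin == end) the discard covered less than one block and is completed immediately. A hedged sketch of that rounding, assuming get_bio_block_range() rounds the start up and the end down to block boundaries (the helper itself is outside this hunk, so treat the arithmetic as an illustration, not the driver's code):

#include <stdio.h>
#include <stdint.h>

typedef uint64_t sector_t;
typedef uint64_t block_t;

#define SECTORS_PER_BLOCK 128ULL	/* e.g. 64KiB thin-pool blocks */

/*
 * Round a sector range to the whole blocks it fully covers: the start
 * rounds up, the end rounds down.  Partially covered edge blocks are
 * deliberately dropped, since discarding them would throw away data
 * outside the request.
 */
static void bio_block_range(sector_t start, sector_t len,
			    block_t *begin, block_t *end)
{
	sector_t last = start + len;

	*begin = (start + SECTORS_PER_BLOCK - 1) / SECTORS_PER_BLOCK;
	*end = last / SECTORS_PER_BLOCK;
	if (*end < *begin)
		*end = *begin;	/* range smaller than a block */
}

int main(void)
{
	block_t b, e;

	bio_block_range(100, 50, &b, &e);	/* lies inside one block */
	printf("small discard -> blocks [%llu, %llu)%s\n",
	       (unsigned long long)b, (unsigned long long)e,
	       b == e ? " (completed immediately)" : "");

	bio_block_range(0, 1000, &b, &e);	/* spans several blocks */
	printf("large discard -> blocks [%llu, %llu)\n",
	       (unsigned long long)b, (unsigned long long)e);
	return 0;
}
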
 
 static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
@@ -2099,6 +2370,24 @@ static void notify_of_pool_mode_change(struct pool *pool, const char *new_mode)
               dm_device_name(pool->pool_md), new_mode);
 }
 
+static bool passdown_enabled(struct pool_c *pt)
+{
+       return pt->adjusted_pf.discard_passdown;
+}
+
+static void set_discard_callbacks(struct pool *pool)
+{
+       struct pool_c *pt = pool->ti->private;
+
+       if (passdown_enabled(pt)) {
+               pool->process_discard_cell = process_discard_cell_passdown;
+               pool->process_prepared_discard = process_prepared_discard_passdown;
+       } else {
+               pool->process_discard_cell = process_discard_cell_no_passdown;
+               pool->process_prepared_discard = process_prepared_discard_no_passdown;
+       }
+}
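set_discard_callbacks() centralises the choice of discard handlers: with passdown enabled the pool gets the *_passdown pair, otherwise the *_no_passdown pair, and set_pool_mode() below only has to call this one helper. A tiny function-pointer sketch of the same dispatch; pool_model and the two handlers are made-up stand-ins for the driver's callbacks.

#include <stdio.h>
#include <stdbool.h>

struct pool_model {
	bool discard_passdown;
	void (*process_discard)(struct pool_model *);
};

static void discard_passdown(struct pool_model *p)
{
	printf("unmap in metadata, then pass the discard down to the data device\n");
}

static void discard_no_passdown(struct pool_model *p)
{
	printf("unmap in metadata only\n");
}

/* one place decides which handler the fast path will call later */
static void set_discard_callbacks(struct pool_model *p)
{
	p->process_discard = p->discard_passdown ? discard_passdown
						 : discard_no_passdown;
}

int main(void)
{
	struct pool_model p = { .discard_passdown = true };

	set_discard_callbacks(&p);
	p.process_discard(&p);		/* passdown path */

	p.discard_passdown = false;	/* e.g. disabled for an unsupported device */
	set_discard_callbacks(&p);
	p.process_discard(&p);		/* metadata-only path */
	return 0;
}
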
+
 static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
 {
        struct pool_c *pt = pool->ti->private;
@@ -2150,7 +2439,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
                pool->process_cell = process_cell_read_only;
                pool->process_discard_cell = process_cell_success;
                pool->process_prepared_mapping = process_prepared_mapping_fail;
-               pool->process_prepared_discard = process_prepared_discard_passdown;
+               pool->process_prepared_discard = process_prepared_discard_success;
 
                error_retry_list(pool);
                break;
@@ -2169,9 +2458,8 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
                pool->process_bio = process_bio_read_only;
                pool->process_discard = process_discard_bio;
                pool->process_cell = process_cell_read_only;
-               pool->process_discard_cell = process_discard_cell;
                pool->process_prepared_mapping = process_prepared_mapping;
-               pool->process_prepared_discard = process_prepared_discard;
+               set_discard_callbacks(pool);
 
                if (!pool->pf.error_if_no_space && no_space_timeout)
                        queue_delayed_work(pool->wq, &pool->no_space_timeout, no_space_timeout);
@@ -2184,9 +2472,8 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
                pool->process_bio = process_bio;
                pool->process_discard = process_discard_bio;
                pool->process_cell = process_cell;
-               pool->process_discard_cell = process_discard_cell;
                pool->process_prepared_mapping = process_prepared_mapping;
-               pool->process_prepared_discard = process_prepared_discard;
+               set_discard_callbacks(pool);
                break;
        }
 
@@ -2275,6 +2562,7 @@ static void thin_hook_bio(struct thin_c *tc, struct bio *bio)
        h->shared_read_entry = NULL;
        h->all_io_entry = NULL;
        h->overwrite_mapping = NULL;
+       h->cell = NULL;
 }
 
 /*
@@ -2422,7 +2710,6 @@ static void disable_passdown_if_not_supported(struct pool_c *pt)
        struct pool *pool = pt->pool;
        struct block_device *data_bdev = pt->data_dev->bdev;
        struct queue_limits *data_limits = &bdev_get_queue(data_bdev)->limits;
-       sector_t block_size = pool->sectors_per_block << SECTOR_SHIFT;
        const char *reason = NULL;
        char buf[BDEVNAME_SIZE];
 
@@ -2435,12 +2722,6 @@ static void disable_passdown_if_not_supported(struct pool_c *pt)
        else if (data_limits->max_discard_sectors < pool->sectors_per_block)
                reason = "max discard sectors smaller than a block";
 
-       else if (data_limits->discard_granularity > block_size)
-               reason = "discard granularity larger than a block";
-
-       else if (!is_factor(block_size, data_limits->discard_granularity))
-               reason = "discard granularity not a factor of block size";
-
        if (reason) {
                DMWARN("Data device (%s) %s: Disabling discard passdown.", bdevname(data_bdev, buf), reason);
                pt->adjusted_pf.discard_passdown = false;
@@ -3375,7 +3656,7 @@ static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
        if (get_pool_mode(pool) >= PM_READ_ONLY) {
                DMERR("%s: unable to service pool target messages in READ_ONLY or FAIL mode",
                      dm_device_name(pool->pool_md));
-               return -EINVAL;
+               return -EOPNOTSUPP;
        }
 
        if (!strcasecmp(argv[0], "create_thin"))
@@ -3573,24 +3854,6 @@ static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
        return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
 }
 
-static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits)
-{
-       struct pool *pool = pt->pool;
-       struct queue_limits *data_limits;
-
-       limits->max_discard_sectors = pool->sectors_per_block;
-
-       /*
-        * discard_granularity is just a hint, and not enforced.
-        */
-       if (pt->adjusted_pf.discard_passdown) {
-               data_limits = &bdev_get_queue(pt->data_dev->bdev)->limits;
-               limits->discard_granularity = max(data_limits->discard_granularity,
-                                                 pool->sectors_per_block << SECTOR_SHIFT);
-       } else
-               limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
-}
-
 static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
 {
        struct pool_c *pt = ti->private;
@@ -3645,14 +3908,17 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
 
        disable_passdown_if_not_supported(pt);
 
-       set_discard_limits(pt, limits);
+       /*
+        * The pool uses the same discard limits as the underlying data
+        * device.  DM core has already set this up.
+        */
 }
 
 static struct target_type pool_target = {
        .name = "thin-pool",
        .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
                    DM_TARGET_IMMUTABLE,
-       .version = {1, 14, 0},
+       .version = {1, 15, 0},
        .module = THIS_MODULE,
        .ctr = pool_ctr,
        .dtr = pool_dtr,
@@ -3811,8 +4077,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
        if (tc->pool->pf.discard_enabled) {
                ti->discards_supported = true;
                ti->num_discard_bios = 1;
-               /* Discard bios must be split on a block boundary */
-               ti->split_discard_bios = true;
+               ti->split_discard_bios = false;
        }
 
        mutex_unlock(&dm_thin_pool_table.mutex);
@@ -3899,6 +4164,9 @@ static int thin_endio(struct dm_target *ti, struct bio *bio, int err)
                }
        }
 
+       if (h->cell)
+               cell_defer_no_holder(h->tc, h->cell);
+
        return 0;
 }
 
@@ -4026,9 +4294,18 @@ static int thin_iterate_devices(struct dm_target *ti,
        return 0;
 }
 
+static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
+{
+       struct thin_c *tc = ti->private;
+       struct pool *pool = tc->pool;
+
+       limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
+       limits->max_discard_sectors = 2048 * 1024 * 16; /* 16G */
+}
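thin_io_hints() advertises per-thin-device discard limits: granularity is one pool block expressed in bytes (sectors_per_block << SECTOR_SHIFT) and the largest single discard is 2048 * 1024 * 16 sectors of 512 bytes, i.e. 16GiB. A quick userspace check of that arithmetic, assuming a 128-sector (64KiB) block size purely as an example:

#include <stdio.h>
#include <stdint.h>

#define SECTOR_SHIFT 9	/* 512-byte sectors */

int main(void)
{
	uint64_t sectors_per_block = 128;	/* e.g. 64KiB thin-pool blocks */

	/* discard_granularity: one pool block, expressed in bytes */
	uint64_t granularity = sectors_per_block << SECTOR_SHIFT;

	/* max_discard_sectors: 2048 * 1024 * 16 sectors of 512 bytes = 16GiB */
	uint64_t max_sectors = 2048ULL * 1024 * 16;
	uint64_t max_bytes = max_sectors << SECTOR_SHIFT;

	printf("granularity = %llu bytes\n", (unsigned long long)granularity);
	printf("max discard = %llu sectors (%llu GiB)\n",
	       (unsigned long long)max_sectors,
	       (unsigned long long)(max_bytes >> 30));
	return 0;
}
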
+
 static struct target_type thin_target = {
        .name = "thin",
-       .version = {1, 14, 0},
+       .version = {1, 15, 0},
        .module = THIS_MODULE,
        .ctr = thin_ctr,
        .dtr = thin_dtr,
@@ -4040,6 +4317,7 @@ static struct target_type thin_target = {
        .status = thin_status,
        .merge = thin_merge,
        .iterate_devices = thin_iterate_devices,
+       .io_hints = thin_io_hints,
 };
 
 /*----------------------------------------------------------------*/
index d728299..2fe0992 100644 (file)
@@ -86,6 +86,9 @@ struct dm_rq_target_io {
        struct kthread_work work;
        int error;
        union map_info info;
+       struct dm_stats_aux stats_aux;
+       unsigned long duration_jiffies;
+       unsigned n_sectors;
 };
 
 /*
@@ -995,6 +998,17 @@ static struct dm_rq_target_io *tio_from_request(struct request *rq)
        return (rq->q->mq_ops ? blk_mq_rq_to_pdu(rq) : rq->special);
 }
 
+static void rq_end_stats(struct mapped_device *md, struct request *orig)
+{
+       if (unlikely(dm_stats_used(&md->stats))) {
+               struct dm_rq_target_io *tio = tio_from_request(orig);
+               tio->duration_jiffies = jiffies - tio->duration_jiffies;
+               dm_stats_account_io(&md->stats, orig->cmd_flags, blk_rq_pos(orig),
+                                   tio->n_sectors, true, tio->duration_jiffies,
+                                   &tio->stats_aux);
+       }
+}
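rq_end_stats() pairs with the start-side accounting added to dm_start_request() further down: tio->duration_jiffies first holds the start timestamp and is then overwritten with the elapsed time at completion. A userspace model of that start/stop bookkeeping; clock() merely stands in for jiffies and the struct is illustrative:

#include <stdio.h>
#include <time.h>

struct tio_model {
	clock_t duration;	/* start timestamp, then elapsed ticks */
	unsigned n_sectors;
};

/* start side: remember when the request was started (dm_start_request) */
static void start_stats(struct tio_model *tio, unsigned sectors)
{
	tio->duration = clock();	/* stands in for jiffies */
	tio->n_sectors = sectors;
}

/* completion side: reuse the same field for the elapsed time (rq_end_stats) */
static void end_stats(struct tio_model *tio)
{
	tio->duration = clock() - tio->duration;
	printf("request of %u sectors took %ld ticks\n",
	       tio->n_sectors, (long)tio->duration);
}

int main(void)
{
	struct tio_model tio;
	volatile unsigned long spin;

	start_stats(&tio, 8);
	for (spin = 0; spin < 10000000UL; spin++)
		;	/* pretend the request is in flight */
	end_stats(&tio);
	return 0;
}
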
+
 /*
  * Don't touch any member of the md after calling this function because
  * the md may be freed in dm_put() at the end of this function.
@@ -1078,6 +1092,7 @@ static void dm_end_request(struct request *clone, int error)
        }
 
        free_rq_clone(clone);
+       rq_end_stats(md, rq);
        if (!rq->q->mq_ops)
                blk_end_request_all(rq, error);
        else
@@ -1113,13 +1128,14 @@ static void old_requeue_request(struct request *rq)
        spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
-static void dm_requeue_unmapped_original_request(struct mapped_device *md,
-                                                struct request *rq)
+static void dm_requeue_original_request(struct mapped_device *md,
+                                       struct request *rq)
 {
        int rw = rq_data_dir(rq);
 
        dm_unprep_request(rq);
 
+       rq_end_stats(md, rq);
        if (!rq->q->mq_ops)
                old_requeue_request(rq);
        else {
@@ -1130,13 +1146,6 @@ static void dm_requeue_unmapped_original_request(struct mapped_device *md,
        rq_completed(md, rw, false);
 }
 
-static void dm_requeue_unmapped_request(struct request *clone)
-{
-       struct dm_rq_target_io *tio = clone->end_io_data;
-
-       dm_requeue_unmapped_original_request(tio->md, tio->orig);
-}
-
 static void old_stop_queue(struct request_queue *q)
 {
        unsigned long flags;
@@ -1200,7 +1209,7 @@ static void dm_done(struct request *clone, int error, bool mapped)
                return;
        else if (r == DM_ENDIO_REQUEUE)
                /* The target wants to requeue the I/O */
-               dm_requeue_unmapped_request(clone);
+               dm_requeue_original_request(tio->md, tio->orig);
        else {
                DMWARN("unimplemented target endio return value: %d", r);
                BUG();
@@ -1218,6 +1227,7 @@ static void dm_softirq_done(struct request *rq)
        int rw;
 
        if (!clone) {
+               rq_end_stats(tio->md, rq);
                rw = rq_data_dir(rq);
                if (!rq->q->mq_ops) {
                        blk_end_request_all(rq, tio->error);
@@ -1910,7 +1920,7 @@ static int map_request(struct dm_rq_target_io *tio, struct request *rq,
                break;
        case DM_MAPIO_REQUEUE:
                /* The target wants to requeue the I/O */
-               dm_requeue_unmapped_request(clone);
+               dm_requeue_original_request(md, tio->orig);
                break;
        default:
                if (r > 0) {
@@ -1933,7 +1943,7 @@ static void map_tio_request(struct kthread_work *work)
        struct mapped_device *md = tio->md;
 
        if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE)
-               dm_requeue_unmapped_original_request(md, rq);
+               dm_requeue_original_request(md, rq);
 }
 
 static void dm_start_request(struct mapped_device *md, struct request *orig)
@@ -1950,6 +1960,14 @@ static void dm_start_request(struct mapped_device *md, struct request *orig)
                md->last_rq_start_time = ktime_get();
        }
 
+       if (unlikely(dm_stats_used(&md->stats))) {
+               struct dm_rq_target_io *tio = tio_from_request(orig);
+               tio->duration_jiffies = jiffies;
+               tio->n_sectors = blk_rq_sectors(orig);
+               dm_stats_account_io(&md->stats, orig->cmd_flags, blk_rq_pos(orig),
+                                   tio->n_sectors, false, 0, &tio->stats_aux);
+       }
+
        /*
         * Hold the md reference here for the in-flight I/O.
         * We can't rely on the reference count by device opener,
@@ -2173,6 +2191,40 @@ static void dm_init_old_md_queue(struct mapped_device *md)
        blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
 }
 
+static void cleanup_mapped_device(struct mapped_device *md)
+{
+       cleanup_srcu_struct(&md->io_barrier);
+
+       if (md->wq)
+               destroy_workqueue(md->wq);
+       if (md->kworker_task)
+               kthread_stop(md->kworker_task);
+       if (md->io_pool)
+               mempool_destroy(md->io_pool);
+       if (md->rq_pool)
+               mempool_destroy(md->rq_pool);
+       if (md->bs)
+               bioset_free(md->bs);
+
+       if (md->disk) {
+               spin_lock(&_minor_lock);
+               md->disk->private_data = NULL;
+               spin_unlock(&_minor_lock);
+               if (blk_get_integrity(md->disk))
+                       blk_integrity_unregister(md->disk);
+               del_gendisk(md->disk);
+               put_disk(md->disk);
+       }
+
+       if (md->queue)
+               blk_cleanup_queue(md->queue);
+
+       if (md->bdev) {
+               bdput(md->bdev);
+               md->bdev = NULL;
+       }
+}
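cleanup_mapped_device() gathers every teardown step behind NULL checks, so the error path in alloc_dev() (which may have built only part of the device) and free_dev() can share one helper regardless of how far setup got. A minimal sketch of that single idempotent-cleanup pattern with made-up resources:

#include <stdio.h>
#include <stdlib.h>

struct device_model {
	void *queue;
	void *disk;
	void *wq;
};

/* every step checks for NULL, so partial construction is handled too */
static void cleanup_device(struct device_model *d)
{
	if (d->wq) {
		printf("destroying workqueue\n");
		free(d->wq);
		d->wq = NULL;
	}
	if (d->disk) {
		printf("releasing disk\n");
		free(d->disk);
		d->disk = NULL;
	}
	if (d->queue) {
		printf("cleaning up queue\n");
		free(d->queue);
		d->queue = NULL;
	}
}

static struct device_model *alloc_device(void)
{
	struct device_model *d = calloc(1, sizeof(*d));

	if (!d)
		return NULL;

	d->queue = malloc(16);
	if (!d->queue)
		goto bad;

	d->disk = malloc(16);
	if (!d->disk)
		goto bad;	/* one label is enough: cleanup checks everything */

	d->wq = malloc(16);
	if (!d->wq)
		goto bad;

	return d;
bad:
	cleanup_device(d);
	free(d);
	return NULL;
}

int main(void)
{
	struct device_model *d = alloc_device();

	if (d) {
		cleanup_device(d);	/* the free_dev() side */
		free(d);
	}
	return 0;
}
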
+
 /*
  * Allocate and initialise a blank device with a given minor.
  */
@@ -2218,13 +2270,13 @@ static struct mapped_device *alloc_dev(int minor)
 
        md->queue = blk_alloc_queue(GFP_KERNEL);
        if (!md->queue)
-               goto bad_queue;
+               goto bad;
 
        dm_init_md_queue(md);
 
        md->disk = alloc_disk(1);
        if (!md->disk)
-               goto bad_disk;
+               goto bad;
 
        atomic_set(&md->pending[0], 0);
        atomic_set(&md->pending[1], 0);
@@ -2245,11 +2297,11 @@ static struct mapped_device *alloc_dev(int minor)
 
        md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0);
        if (!md->wq)
-               goto bad_thread;
+               goto bad;
 
        md->bdev = bdget_disk(md->disk, 0);
        if (!md->bdev)
-               goto bad_bdev;
+               goto bad;
 
        bio_init(&md->flush_bio);
        md->flush_bio.bi_bdev = md->bdev;
@@ -2266,15 +2318,8 @@ static struct mapped_device *alloc_dev(int minor)
 
        return md;
 
-bad_bdev:
-       destroy_workqueue(md->wq);
-bad_thread:
-       del_gendisk(md->disk);
-       put_disk(md->disk);
-bad_disk:
-       blk_cleanup_queue(md->queue);
-bad_queue:
-       cleanup_srcu_struct(&md->io_barrier);
+bad:
+       cleanup_mapped_device(md);
 bad_io_barrier:
        free_minor(minor);
 bad_minor:
@@ -2291,71 +2336,65 @@ static void free_dev(struct mapped_device *md)
        int minor = MINOR(disk_devt(md->disk));
 
        unlock_fs(md);
-       destroy_workqueue(md->wq);
 
-       if (md->kworker_task)
-               kthread_stop(md->kworker_task);
-       if (md->io_pool)
-               mempool_destroy(md->io_pool);
-       if (md->rq_pool)
-               mempool_destroy(md->rq_pool);
-       if (md->bs)
-               bioset_free(md->bs);
+       cleanup_mapped_device(md);
+       if (md->use_blk_mq)
+               blk_mq_free_tag_set(&md->tag_set);
 
-       cleanup_srcu_struct(&md->io_barrier);
        free_table_devices(&md->table_devices);
        dm_stats_cleanup(&md->stats);
-
-       spin_lock(&_minor_lock);
-       md->disk->private_data = NULL;
-       spin_unlock(&_minor_lock);
-       if (blk_get_integrity(md->disk))
-               blk_integrity_unregister(md->disk);
-       del_gendisk(md->disk);
-       put_disk(md->disk);
-       blk_cleanup_queue(md->queue);
-       if (md->use_blk_mq)
-               blk_mq_free_tag_set(&md->tag_set);
-       bdput(md->bdev);
        free_minor(minor);
 
        module_put(THIS_MODULE);
        kfree(md);
 }
 
+static unsigned filter_md_type(unsigned type, struct mapped_device *md)
+{
+       if (type == DM_TYPE_BIO_BASED)
+               return type;
+
+       return !md->use_blk_mq ? DM_TYPE_REQUEST_BASED : DM_TYPE_MQ_REQUEST_BASED;
+}
+
 static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
 {
        struct dm_md_mempools *p = dm_table_get_md_mempools(t);
 
-       if (md->bs) {
-               /* The md already has necessary mempools. */
-               if (dm_table_get_type(t) == DM_TYPE_BIO_BASED) {
+       switch (filter_md_type(dm_table_get_type(t), md)) {
+       case DM_TYPE_BIO_BASED:
+               if (md->bs && md->io_pool) {
                        /*
+                        * This bio-based md already has necessary mempools.
                         * Reload bioset because front_pad may have changed
                         * because a different table was loaded.
                         */
                        bioset_free(md->bs);
                        md->bs = p->bs;
                        p->bs = NULL;
+                       goto out;
                }
-               /*
-                * There's no need to reload with request-based dm
-                * because the size of front_pad doesn't change.
-                * Note for future: If you are to reload bioset,
-                * prep-ed requests in the queue may refer
-                * to bio from the old bioset, so you must walk
-                * through the queue to unprep.
-                */
-               goto out;
+               break;
+       case DM_TYPE_REQUEST_BASED:
+               if (md->rq_pool && md->io_pool)
+                       /*
+                        * This request-based md already has necessary mempools.
+                        */
+                       goto out;
+               break;
+       case DM_TYPE_MQ_REQUEST_BASED:
+               BUG_ON(p); /* No mempools needed */
+               return;
        }
 
+       BUG_ON(!p || md->io_pool || md->rq_pool || md->bs);
+
        md->io_pool = p->io_pool;
        p->io_pool = NULL;
        md->rq_pool = p->rq_pool;
        p->rq_pool = NULL;
        md->bs = p->bs;
        p->bs = NULL;
-
 out:
        /* mempool bind completed, no longer need any mempools in the table */
        dm_table_free_md_mempools(t);
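__bind_mempools() now dispatches on the (filtered) device type: bio-based devices swap in the freshly sized bioset, request-based devices keep their existing pools when present, and blk-mq devices need no mempools at all. The decision structure can be sketched with a toy enum and flags; none of this is the real mempool code:

#include <stdio.h>
#include <stdbool.h>

enum md_type { BIO_BASED, REQUEST_BASED, MQ_REQUEST_BASED };

struct md_model {
	bool has_bioset;
	bool has_rq_pool;
};

/* decide what, if anything, must be (re)bound for this table load */
static void bind_mempools(struct md_model *md, enum md_type type)
{
	switch (type) {
	case BIO_BASED:
		if (md->has_bioset) {
			/* front_pad may differ: replace the bioset */
			printf("bio-based: reloading bioset\n");
			return;
		}
		break;
	case REQUEST_BASED:
		if (md->has_rq_pool) {
			printf("request-based: pools already present, nothing to do\n");
			return;
		}
		break;
	case MQ_REQUEST_BASED:
		printf("blk-mq: no mempools needed\n");
		return;
	}

	printf("taking fresh pools from the table\n");
	md->has_bioset = true;
	md->has_rq_pool = (type == REQUEST_BASED);
}

int main(void)
{
	struct md_model md = { false, false };

	bind_mempools(&md, REQUEST_BASED);	/* first load: take pools */
	bind_mempools(&md, REQUEST_BASED);	/* reload: keep existing pools */
	bind_mempools(&md, MQ_REQUEST_BASED);	/* blk-mq: nothing needed */
	return 0;
}
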
@@ -2675,6 +2714,7 @@ static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
                /* Direct call is fine since .queue_rq allows allocations */
                if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE) {
                        /* Undo dm_start_request() before requeuing */
+                       rq_end_stats(md, rq);
                        rq_completed(md, rq_data_dir(rq), false);
                        return BLK_MQ_RQ_QUEUE_BUSY;
                }
@@ -2734,14 +2774,6 @@ out_tag_set:
        return err;
 }
 
-static unsigned filter_md_type(unsigned type, struct mapped_device *md)
-{
-       if (type == DM_TYPE_BIO_BASED)
-               return type;
-
-       return !md->use_blk_mq ? DM_TYPE_REQUEST_BASED : DM_TYPE_MQ_REQUEST_BASED;
-}
-
 /*
  * Setup the DM device's queue based on md's type
  */
@@ -3463,7 +3495,7 @@ struct dm_md_mempools *dm_alloc_bio_mempools(unsigned integrity,
 
        pools = kzalloc(sizeof(*pools), GFP_KERNEL);
        if (!pools)
-               return NULL;
+               return ERR_PTR(-ENOMEM);
 
        front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) +
                offsetof(struct dm_target_io, clone);
@@ -3482,24 +3514,26 @@ struct dm_md_mempools *dm_alloc_bio_mempools(unsigned integrity,
        return pools;
 out:
        dm_free_md_mempools(pools);
-       return NULL;
+       return ERR_PTR(-ENOMEM);
 }
 
 struct dm_md_mempools *dm_alloc_rq_mempools(struct mapped_device *md,
                                            unsigned type)
 {
-       unsigned int pool_size = dm_get_reserved_rq_based_ios();
+       unsigned int pool_size;
        struct dm_md_mempools *pools;
 
+       if (filter_md_type(type, md) == DM_TYPE_MQ_REQUEST_BASED)
+               return NULL; /* No mempools needed */
+
+       pool_size = dm_get_reserved_rq_based_ios();
        pools = kzalloc(sizeof(*pools), GFP_KERNEL);
        if (!pools)
-               return NULL;
+               return ERR_PTR(-ENOMEM);
 
-       if (filter_md_type(type, md) == DM_TYPE_REQUEST_BASED) {
-               pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache);
-               if (!pools->rq_pool)
-                       goto out;
-       }
+       pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache);
+       if (!pools->rq_pool)
+               goto out;
 
        pools->io_pool = mempool_create_slab_pool(pool_size, _rq_tio_cache);
        if (!pools->io_pool)
@@ -3508,7 +3542,7 @@ struct dm_md_mempools *dm_alloc_rq_mempools(struct mapped_device *md,
        return pools;
 out:
        dm_free_md_mempools(pools);
-       return NULL;
+       return ERR_PTR(-ENOMEM);
 }
 
 void dm_free_md_mempools(struct dm_md_mempools *pools)
index 087411c..4d6c9b6 100644 (file)
@@ -609,6 +609,12 @@ void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b)
        dm_bufio_prefetch(bm->bufio, b, 1);
 }
 
+bool dm_bm_is_read_only(struct dm_block_manager *bm)
+{
+       return bm->read_only;
+}
+EXPORT_SYMBOL_GPL(dm_bm_is_read_only);
+
 void dm_bm_set_read_only(struct dm_block_manager *bm)
 {
        bm->read_only = true;
index 1b95dfc..84330f5 100644 (file)
@@ -123,6 +123,7 @@ void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b);
  * Additionally, you should not use dm_bm_unlock_move; however, no error
  * will be returned if you do.
  */
+bool dm_bm_is_read_only(struct dm_block_manager *bm);
 void dm_bm_set_read_only(struct dm_block_manager *bm);
 void dm_bm_set_read_write(struct dm_block_manager *bm);
 
index b88757c..e04cfd2 100644 (file)
@@ -590,3 +590,130 @@ int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
        return r;
 }
 EXPORT_SYMBOL_GPL(dm_btree_remove);
+
+/*----------------------------------------------------------------*/
+
+static int remove_nearest(struct shadow_spine *s, struct dm_btree_info *info,
+                         struct dm_btree_value_type *vt, dm_block_t root,
+                         uint64_t key, int *index)
+{
+       int i = *index, r;
+       struct btree_node *n;
+
+       for (;;) {
+               r = shadow_step(s, root, vt);
+               if (r < 0)
+                       break;
+
+               /*
+                * We have to patch up the parent node, ugly, but I don't
+                * see a way to do this automatically as part of the spine
+                * op.
+                */
+               if (shadow_has_parent(s)) {
+                       __le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
+                       memcpy(value_ptr(dm_block_data(shadow_parent(s)), i),
+                              &location, sizeof(__le64));
+               }
+
+               n = dm_block_data(shadow_current(s));
+
+               if (le32_to_cpu(n->header.flags) & LEAF_NODE) {
+                       *index = lower_bound(n, key);
+                       return 0;
+               }
+
+               r = rebalance_children(s, info, vt, key);
+               if (r)
+                       break;
+
+               n = dm_block_data(shadow_current(s));
+               if (le32_to_cpu(n->header.flags) & LEAF_NODE) {
+                       *index = lower_bound(n, key);
+                       return 0;
+               }
+
+               i = lower_bound(n, key);
+
+               /*
+                * We know the key is present, or else
+                * rebalance_children would have returned
+                * -ENODATA
+                */
+               root = value64(n, i);
+       }
+
+       return r;
+}
+
+static int remove_one(struct dm_btree_info *info, dm_block_t root,
+                     uint64_t *keys, uint64_t end_key,
+                     dm_block_t *new_root, unsigned *nr_removed)
+{
+       unsigned level, last_level = info->levels - 1;
+       int index = 0, r = 0;
+       struct shadow_spine spine;
+       struct btree_node *n;
+       uint64_t k;
+
+       init_shadow_spine(&spine, info);
+       for (level = 0; level < last_level; level++) {
+               r = remove_raw(&spine, info, &le64_type,
+                              root, keys[level], (unsigned *) &index);
+               if (r < 0)
+                       goto out;
+
+               n = dm_block_data(shadow_current(&spine));
+               root = value64(n, index);
+       }
+
+       r = remove_nearest(&spine, info, &info->value_type,
+                          root, keys[last_level], &index);
+       if (r < 0)
+               goto out;
+
+       n = dm_block_data(shadow_current(&spine));
+
+       if (index < 0)
+               index = 0;
+
+       if (index >= le32_to_cpu(n->header.nr_entries)) {
+               r = -ENODATA;
+               goto out;
+       }
+
+       k = le64_to_cpu(n->keys[index]);
+       if (k >= keys[last_level] && k < end_key) {
+               if (info->value_type.dec)
+                       info->value_type.dec(info->value_type.context,
+                                            value_ptr(n, index));
+
+               delete_at(n, index);
+
+       } else
+               r = -ENODATA;
+
+out:
+       *new_root = shadow_root(&spine);
+       exit_shadow_spine(&spine);
+
+       return r;
+}
+
+int dm_btree_remove_leaves(struct dm_btree_info *info, dm_block_t root,
+                          uint64_t *first_key, uint64_t end_key,
+                          dm_block_t *new_root, unsigned *nr_removed)
+{
+       int r;
+
+       *nr_removed = 0;
+       do {
+               r = remove_one(info, root, first_key, end_key, &root, nr_removed);
+               if (!r)
+                       (*nr_removed)++;
+       } while (!r);
+
+       *new_root = root;
+       return r == -ENODATA ? 0 : r;
+}
+EXPORT_SYMBOL_GPL(dm_btree_remove_leaves);
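dm_btree_remove_leaves() simply calls remove_one() until it reports -ENODATA, counting how many leaf entries were deleted, and translates the final -ENODATA into success. A flat sorted-array model of that remove-one-at-a-time loop (the real btree shadowing and rebalancing is elided; -2 plays the role of -ENODATA):

#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define NKEYS 8

static uint64_t keys[NKEYS] = { 1, 3, 5, 7, 9, 11, 13, 15 };
static unsigned nkeys = NKEYS;

/* remove the lowest key in [first, end); -2 stands in for -ENODATA */
static int remove_one(uint64_t first, uint64_t end)
{
	unsigned i;

	for (i = 0; i < nkeys; i++) {
		if (keys[i] >= first && keys[i] < end) {
			memmove(&keys[i], &keys[i + 1],
				(nkeys - i - 1) * sizeof(keys[0]));
			nkeys--;
			return 0;
		}
	}
	return -2;	/* nothing left in the range */
}

static int remove_leaves(uint64_t first, uint64_t end, unsigned *nr_removed)
{
	int r;

	*nr_removed = 0;
	do {
		r = remove_one(first, end);
		if (!r)
			(*nr_removed)++;
	} while (!r);

	return r == -2 ? 0 : r;	/* running out of keys is success */
}

int main(void)
{
	unsigned removed, i;

	remove_leaves(5, 12, &removed);	/* end is one-past-the-end */
	printf("removed %u keys, remaining:", removed);
	for (i = 0; i < nkeys; i++)
		printf(" %llu", (unsigned long long)keys[i]);
	printf("\n");
	return 0;
}
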
index dacfc34..11d8cf7 100644 (file)
@@ -135,6 +135,15 @@ int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
                    uint64_t *keys, dm_block_t *new_root);
 
 /*
+ * Removes values between 'keys' and 'keys2', where 'keys2' is 'keys' with
+ * its final key replaced by 'end_key'.  'end_key' is the one-past-the-end
+ * value.  'keys' may be altered.
+ */
+int dm_btree_remove_leaves(struct dm_btree_info *info, dm_block_t root,
+                          uint64_t *keys, uint64_t end_key,
+                          dm_block_t *new_root, unsigned *nr_removed);
+
+/*
  * Returns < 0 on failure.  Otherwise the number of key entries that have
  * been filled out.  Remember trees can have zero entries, and as such have
  * no lowest key.
index e8a9042..5309129 100644 (file)
@@ -204,6 +204,27 @@ static void in(struct sm_metadata *smm)
        smm->recursion_count++;
 }
 
+static int apply_bops(struct sm_metadata *smm)
+{
+       int r = 0;
+
+       while (!brb_empty(&smm->uncommitted)) {
+               struct block_op bop;
+
+               r = brb_pop(&smm->uncommitted, &bop);
+               if (r) {
+                       DMERR("bug in bop ring buffer");
+                       break;
+               }
+
+               r = commit_bop(smm, &bop);
+               if (r)
+                       break;
+       }
+
+       return r;
+}
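apply_bops() is the old inline loop from out() factored into a helper so that the extend and create paths below can also flush pending block operations before committing. A small sketch of the same pop-and-commit-until-empty drain loop over a toy FIFO; the ring and commit_op() are stand-ins for brb_pop()/commit_bop():

#include <stdio.h>

#define RING_SIZE 8

struct block_op {
	int type;	/* e.g. increment/decrement of a block's refcount */
	unsigned block;
};

struct ring {
	struct block_op ops[RING_SIZE];
	unsigned head, tail;	/* tail == head means empty */
};

static int ring_empty(const struct ring *r)
{
	return r->head == r->tail;
}

static int ring_pop(struct ring *r, struct block_op *op)
{
	if (ring_empty(r))
		return -1;
	*op = r->ops[r->tail];
	r->tail = (r->tail + 1) % RING_SIZE;
	return 0;
}

/* stand-in for commit_bop(): apply one deferred op to the space map */
static int commit_op(const struct block_op *op)
{
	printf("committing op %d on block %u\n", op->type, op->block);
	return 0;
}

/* drain everything that was queued while the space map was busy */
static int apply_ops(struct ring *r)
{
	int rc = 0;

	while (!ring_empty(r)) {
		struct block_op op;

		rc = ring_pop(r, &op);
		if (rc)
			break;	/* corrupted ring: bail out, like the DMERR above */

		rc = commit_op(&op);
		if (rc)
			break;	/* stop on the first failing commit */
	}

	return rc;
}

int main(void)
{
	struct ring r = { .head = 0, .tail = 0 };
	unsigned i;

	for (i = 0; i < 3; i++) {
		r.ops[r.head] = (struct block_op){ .type = 1, .block = i };
		r.head = (r.head + 1) % RING_SIZE;
	}

	return apply_ops(&r);
}
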
+
 static int out(struct sm_metadata *smm)
 {
        int r = 0;
@@ -216,21 +237,8 @@ static int out(struct sm_metadata *smm)
                return -ENOMEM;
        }
 
-       if (smm->recursion_count == 1) {
-               while (!brb_empty(&smm->uncommitted)) {
-                       struct block_op bop;
-
-                       r = brb_pop(&smm->uncommitted, &bop);
-                       if (r) {
-                               DMERR("bug in bop ring buffer");
-                               break;
-                       }
-
-                       r = commit_bop(smm, &bop);
-                       if (r)
-                               break;
-               }
-       }
+       if (smm->recursion_count == 1)
+               apply_bops(smm);
 
        smm->recursion_count--;
 
@@ -704,6 +712,12 @@ static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks)
                }
                old_len = smm->begin;
 
+               r = apply_bops(smm);
+               if (r) {
+                       DMERR("%s: apply_bops failed", __func__);
+                       goto out;
+               }
+
                r = sm_ll_commit(&smm->ll);
                if (r)
                        goto out;
@@ -773,6 +787,12 @@ int dm_sm_metadata_create(struct dm_space_map *sm,
        if (r)
                return r;
 
+       r = apply_bops(smm);
+       if (r) {
+               DMERR("%s: apply_bops failed", __func__);
+               return r;
+       }
+
        return sm_metadata_commit(sm);
 }