Merge branch 'for-2.6.40/core' of git://git.kernel.dk/linux-2.6-block

author Linus Torvalds <torvalds@linux-foundation.org>

Wed, 25 May 2011 16:14:07 +0000 (09:14 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Wed, 25 May 2011 16:14:07 +0000 (09:14 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Wed, 25 May 2011 16:14:07 +0000 (09:14 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Wed, 25 May 2011 16:14:07 +0000 (09:14 -0700)
diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block

index 4873c75..c1eb41c 100644 (file)
--- a/Documentation/ABI/testing/sysfs-block
+++ b/Documentation/ABI/testing/sysfs-block
@@ -142,3 +142,67 @@ Description:
                 with the previous I/O request are enabled. When set to 2,
                 all merge tries are disabled. The default value is 0 -
                 which enables all types of merge tries.
+
+What:          /sys/block/<disk>/discard_alignment
+Date:          May 2011
+Contact:       Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+               Devices that support discard functionality may
+               internally allocate space in units that are bigger than
+               the exported logical block size. The discard_alignment
+               parameter indicates how many bytes the beginning of the
+               device is offset from the internal allocation unit's
+               natural alignment.
+
+What:          /sys/block/<disk>/<partition>/discard_alignment
+Date:          May 2011
+Contact:       Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+               Devices that support discard functionality may
+               internally allocate space in units that are bigger than
+               the exported logical block size. The discard_alignment
+               parameter indicates how many bytes the beginning of the
+               partition is offset from the internal allocation unit's
+               natural alignment.
+
+What:          /sys/block/<disk>/queue/discard_granularity
+Date:          May 2011
+Contact:       Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+               Devices that support discard functionality may
+               internally allocate space using units that are bigger
+               than the logical block size. The discard_granularity
+               parameter indicates the size of the internal allocation
+               unit in bytes if reported by the device. Otherwise the
+               discard_granularity will be set to match the device's
+               physical block size. A discard_granularity of 0 means
+               that the device does not support discard functionality.
+
+What:          /sys/block/<disk>/queue/discard_max_bytes
+Date:          May 2011
+Contact:       Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+               Devices that support discard functionality may have
+               internal limits on the number of bytes that can be
+               trimmed or unmapped in a single operation. Some storage
+               protocols also have inherent limits on the number of
+               blocks that can be described in a single command. The
+               discard_max_bytes parameter is set by the device driver
+               to the maximum number of bytes that can be discarded in
+               a single operation. Discard requests issued to the
+               device must not exceed this limit. A discard_max_bytes
+               value of 0 means that the device does not support
+               discard functionality.
+
+What:          /sys/block/<disk>/queue/discard_zeroes_data
+Date:          May 2011
+Contact:       Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+               Devices that support discard functionality may return
+               stale or random data when a previously discarded block
+               is read back. This can cause problems if the filesystem
+               expects discarded blocks to be explicitly cleared. If a
+               device reports that it deterministically returns zeroes
+               when a discarded area is read the discard_zeroes_data
+               parameter will be set to one. Otherwise it will be 0 and
+               the result of reading a discarded area is undefined.
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c

index 471fdcc..07371cf 100644 (file)
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -385,25 +385,40 @@ void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time,
  
         spin_lock_irqsave(&blkg->stats_lock, flags);
         blkg->stats.time += time;
+#ifdef CONFIG_DEBUG_BLK_CGROUP
         blkg->stats.unaccounted_time += unaccounted_time;
+#endif
         spin_unlock_irqrestore(&blkg->stats_lock, flags);
  }
  EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);
  
+/*
+ * should be called under rcu read lock or queue lock to make sure blkg pointer
+ * is valid.
+ */
  void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
                                 uint64_t bytes, bool direction, bool sync)
  {
-       struct blkio_group_stats *stats;
+       struct blkio_group_stats_cpu *stats_cpu;
         unsigned long flags;
  
-       spin_lock_irqsave(&blkg->stats_lock, flags);
-       stats = &blkg->stats;
-       stats->sectors += bytes >> 9;
-       blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICED], 1, direction,
-                       sync);
-       blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_BYTES], bytes,
-                       direction, sync);
-       spin_unlock_irqrestore(&blkg->stats_lock, flags);
+       /*
+        * Disabling interrupts to provide mutual exclusion between two
+        * writes on same cpu. It probably is not needed for 64bit. Not
+        * optimizing that case yet.
+        */
+       local_irq_save(flags);
+
+       stats_cpu = this_cpu_ptr(blkg->stats_cpu);
+
+       u64_stats_update_begin(&stats_cpu->syncp);
+       stats_cpu->sectors += bytes >> 9;
+       blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICED],
+                       1, direction, sync);
+       blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICE_BYTES],
+                       bytes, direction, sync);
+       u64_stats_update_end(&stats_cpu->syncp);
+       local_irq_restore(flags);
  }
  EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats);
  
@@ -426,18 +441,44 @@ void blkiocg_update_completion_stats(struct blkio_group *blkg,
  }
  EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats);
  
+/*  Merged stats are per cpu.  */
  void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
                                         bool sync)
  {
+       struct blkio_group_stats_cpu *stats_cpu;
         unsigned long flags;
  
-       spin_lock_irqsave(&blkg->stats_lock, flags);
-       blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_MERGED], 1, direction,
-                       sync);
-       spin_unlock_irqrestore(&blkg->stats_lock, flags);
+       /*
+        * Disabling interrupts to provide mutual exclusion between two
+        * writes on same cpu. It probably is not needed for 64bit. Not
+        * optimizing that case yet.
+        */
+       local_irq_save(flags);
+
+       stats_cpu = this_cpu_ptr(blkg->stats_cpu);
+
+       u64_stats_update_begin(&stats_cpu->syncp);
+       blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_MERGED], 1,
+                               direction, sync);
+       u64_stats_update_end(&stats_cpu->syncp);
+       local_irq_restore(flags);
  }
  EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);
  
+/*
+ * This function allocates the per cpu stats for blkio_group. Should be called
+ * from sleepable context as alloc_percpu() requires that.
+ */
+int blkio_alloc_blkg_stats(struct blkio_group *blkg)
+{
+       /* Allocate memory for per cpu stats */
+       blkg->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu);
+       if (!blkg->stats_cpu)
+               return -ENOMEM;
+       return 0;
+}
+EXPORT_SYMBOL_GPL(blkio_alloc_blkg_stats);
+
  void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
                 struct blkio_group *blkg, void *key, dev_t dev,
                 enum blkio_policy_id plid)
@@ -508,6 +549,30 @@ struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key)
  }
  EXPORT_SYMBOL_GPL(blkiocg_lookup_group);
  
+static void blkio_reset_stats_cpu(struct blkio_group *blkg)
+{
+       struct blkio_group_stats_cpu *stats_cpu;
+       int i, j, k;
+       /*
+        * Note: On 64 bit arch this should not be an issue. This has the
+        * possibility of returning some inconsistent value on 32bit arch
+        * as 64bit update on 32bit is non atomic. Taking care of this
+        * corner case makes code very complicated, like sending IPIs to
+        * cpus, taking care of stats of offline cpus etc.
+        *
+        * reset stats is anyway more of a debug feature and this sounds a
+        * corner case. So I am not complicating the code yet until and
+        * unless this becomes a real issue.
+        */
+       for_each_possible_cpu(i) {
+               stats_cpu = per_cpu_ptr(blkg->stats_cpu, i);
+               stats_cpu->sectors = 0;
+               for(j = 0; j < BLKIO_STAT_CPU_NR; j++)
+                       for (k = 0; k < BLKIO_STAT_TOTAL; k++)
+                               stats_cpu->stat_arr_cpu[j][k] = 0;
+       }
+}
+
  static int
  blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
  {
@@ -552,7 +617,11 @@ blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
                 }
  #endif
                 spin_unlock(&blkg->stats_lock);
+
+               /* Reset Per cpu stats which don't take blkg->stats_lock */
+               blkio_reset_stats_cpu(blkg);
         }
+
         spin_unlock_irq(&blkcg->lock);
         return 0;
  }
@@ -598,6 +667,59 @@ static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val,
         return val;
  }
  
+
+static uint64_t blkio_read_stat_cpu(struct blkio_group *blkg,
+                       enum stat_type_cpu type, enum stat_sub_type sub_type)
+{
+       int cpu;
+       struct blkio_group_stats_cpu *stats_cpu;
+       u64 val = 0, tval;
+
+       for_each_possible_cpu(cpu) {
+               unsigned int start;
+               stats_cpu  = per_cpu_ptr(blkg->stats_cpu, cpu);
+
+               do {
+                       start = u64_stats_fetch_begin(&stats_cpu->syncp);
+                       if (type == BLKIO_STAT_CPU_SECTORS)
+                               tval = stats_cpu->sectors;
+                       else
+                               tval = stats_cpu->stat_arr_cpu[type][sub_type];
+               } while(u64_stats_fetch_retry(&stats_cpu->syncp, start));
+
+               val += tval;
+       }
+
+       return val;
+}
+
+static uint64_t blkio_get_stat_cpu(struct blkio_group *blkg,
+               struct cgroup_map_cb *cb, dev_t dev, enum stat_type_cpu type)
+{
+       uint64_t disk_total, val;
+       char key_str[MAX_KEY_LEN];
+       enum stat_sub_type sub_type;
+
+       if (type == BLKIO_STAT_CPU_SECTORS) {
+               val = blkio_read_stat_cpu(blkg, type, 0);
+               return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, val, cb, dev);
+       }
+
+       for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
+                       sub_type++) {
+               blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false);
+               val = blkio_read_stat_cpu(blkg, type, sub_type);
+               cb->fill(cb, key_str, val);
+       }
+
+       disk_total = blkio_read_stat_cpu(blkg, type, BLKIO_STAT_READ) +
+                       blkio_read_stat_cpu(blkg, type, BLKIO_STAT_WRITE);
+
+       blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false);
+       cb->fill(cb, key_str, disk_total);
+       return disk_total;
+}
+
  /* This should be called with blkg->stats_lock held */
  static uint64_t blkio_get_stat(struct blkio_group *blkg,
                 struct cgroup_map_cb *cb, dev_t dev, enum stat_type type)
@@ -609,9 +731,6 @@ static uint64_t blkio_get_stat(struct blkio_group *blkg,
         if (type == BLKIO_STAT_TIME)
                 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
                                         blkg->stats.time, cb, dev);
-       if (type == BLKIO_STAT_SECTORS)
-               return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
-                                       blkg->stats.sectors, cb, dev);
  #ifdef CONFIG_DEBUG_BLK_CGROUP
         if (type == BLKIO_STAT_UNACCOUNTED_TIME)
                 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
@@ -1075,8 +1194,8 @@ static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft,
  }
  
  static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg,
-               struct cftype *cft, struct cgroup_map_cb *cb, enum stat_type type,
-               bool show_total)
+               struct cftype *cft, struct cgroup_map_cb *cb,
+               enum stat_type type, bool show_total, bool pcpu)
  {
         struct blkio_group *blkg;
         struct hlist_node *n;
@@ -1087,10 +1206,15 @@ static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg,
                 if (blkg->dev) {
                         if (!cftype_blkg_same_policy(cft, blkg))
                                 continue;
-                       spin_lock_irq(&blkg->stats_lock);
-                       cgroup_total += blkio_get_stat(blkg, cb, blkg->dev,
-                                               type);
-                       spin_unlock_irq(&blkg->stats_lock);
+                       if (pcpu)
+                               cgroup_total += blkio_get_stat_cpu(blkg, cb,
+                                               blkg->dev, type);
+                       else {
+                               spin_lock_irq(&blkg->stats_lock);
+                               cgroup_total += blkio_get_stat(blkg, cb,
+                                               blkg->dev, type);
+                               spin_unlock_irq(&blkg->stats_lock);
+                       }
                 }
         }
         if (show_total)
@@ -1114,47 +1238,47 @@ static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft,
                 switch(name) {
                 case BLKIO_PROP_time:
                         return blkio_read_blkg_stats(blkcg, cft, cb,
-                                               BLKIO_STAT_TIME, 0);
+                                               BLKIO_STAT_TIME, 0, 0);
                 case BLKIO_PROP_sectors:
                         return blkio_read_blkg_stats(blkcg, cft, cb,
-                                               BLKIO_STAT_SECTORS, 0);
+                                               BLKIO_STAT_CPU_SECTORS, 0, 1);
                 case BLKIO_PROP_io_service_bytes:
                         return blkio_read_blkg_stats(blkcg, cft, cb,
-                                               BLKIO_STAT_SERVICE_BYTES, 1);
+                                       BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1);
                 case BLKIO_PROP_io_serviced:
                         return blkio_read_blkg_stats(blkcg, cft, cb,
-                                               BLKIO_STAT_SERVICED, 1);
+                                               BLKIO_STAT_CPU_SERVICED, 1, 1);
                 case BLKIO_PROP_io_service_time:
                         return blkio_read_blkg_stats(blkcg, cft, cb,
-                                               BLKIO_STAT_SERVICE_TIME, 1);
+                                               BLKIO_STAT_SERVICE_TIME, 1, 0);
                 case BLKIO_PROP_io_wait_time:
                         return blkio_read_blkg_stats(blkcg, cft, cb,
-                                               BLKIO_STAT_WAIT_TIME, 1);
+                                               BLKIO_STAT_WAIT_TIME, 1, 0);
                 case BLKIO_PROP_io_merged:
                         return blkio_read_blkg_stats(blkcg, cft, cb,
-                                               BLKIO_STAT_MERGED, 1);
+                                               BLKIO_STAT_CPU_MERGED, 1, 1);
                 case BLKIO_PROP_io_queued:
                         return blkio_read_blkg_stats(blkcg, cft, cb,
-                                               BLKIO_STAT_QUEUED, 1);
+                                               BLKIO_STAT_QUEUED, 1, 0);
  #ifdef CONFIG_DEBUG_BLK_CGROUP
                 case BLKIO_PROP_unaccounted_time:
                         return blkio_read_blkg_stats(blkcg, cft, cb,
-                                               BLKIO_STAT_UNACCOUNTED_TIME, 0);
+                                       BLKIO_STAT_UNACCOUNTED_TIME, 0, 0);
                 case BLKIO_PROP_dequeue:
                         return blkio_read_blkg_stats(blkcg, cft, cb,
-                                               BLKIO_STAT_DEQUEUE, 0);
+                                               BLKIO_STAT_DEQUEUE, 0, 0);
                 case BLKIO_PROP_avg_queue_size:
                         return blkio_read_blkg_stats(blkcg, cft, cb,
-                                               BLKIO_STAT_AVG_QUEUE_SIZE, 0);
+                                       BLKIO_STAT_AVG_QUEUE_SIZE, 0, 0);
                 case BLKIO_PROP_group_wait_time:
                         return blkio_read_blkg_stats(blkcg, cft, cb,
-                                               BLKIO_STAT_GROUP_WAIT_TIME, 0);
+                                       BLKIO_STAT_GROUP_WAIT_TIME, 0, 0);
                 case BLKIO_PROP_idle_time:
                         return blkio_read_blkg_stats(blkcg, cft, cb,
-                                               BLKIO_STAT_IDLE_TIME, 0);
+                                               BLKIO_STAT_IDLE_TIME, 0, 0);
                 case BLKIO_PROP_empty_time:
                         return blkio_read_blkg_stats(blkcg, cft, cb,
-                                               BLKIO_STAT_EMPTY_TIME, 0);
+                                               BLKIO_STAT_EMPTY_TIME, 0, 0);
  #endif
                 default:
                         BUG();
@@ -1164,10 +1288,10 @@ static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft,
                 switch(name){
                 case BLKIO_THROTL_io_service_bytes:
                         return blkio_read_blkg_stats(blkcg, cft, cb,
-                                               BLKIO_STAT_SERVICE_BYTES, 1);
+                                               BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1);
                 case BLKIO_THROTL_io_serviced:
                         return blkio_read_blkg_stats(blkcg, cft, cb,
-                                               BLKIO_STAT_SERVICED, 1);
+                                               BLKIO_STAT_CPU_SERVICED, 1, 1);
                 default:
                         BUG();
                 }
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h

index c774930..a71d290 100644 (file)
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -14,6 +14,7 @@
   */
  
  #include <linux/cgroup.h>
+#include <linux/u64_stats_sync.h>
  
  enum blkio_policy_id {
         BLKIO_POLICY_PROP = 0,          /* Proportional Bandwidth division */
@@ -36,22 +37,15 @@ enum stat_type {
          * request completion for IOs done by this cgroup. This may not be
          * accurate when NCQ is turned on. */
         BLKIO_STAT_SERVICE_TIME = 0,
-       /* Total bytes transferred */
-       BLKIO_STAT_SERVICE_BYTES,
-       /* Total IOs serviced, post merge */
-       BLKIO_STAT_SERVICED,
         /* Total time spent waiting in scheduler queue in ns */
         BLKIO_STAT_WAIT_TIME,
-       /* Number of IOs merged */
-       BLKIO_STAT_MERGED,
         /* Number of IOs queued up */
         BLKIO_STAT_QUEUED,
         /* All the single valued stats go below this */
         BLKIO_STAT_TIME,
-       BLKIO_STAT_SECTORS,
+#ifdef CONFIG_DEBUG_BLK_CGROUP
         /* Time not charged to this cgroup */
         BLKIO_STAT_UNACCOUNTED_TIME,
-#ifdef CONFIG_DEBUG_BLK_CGROUP
         BLKIO_STAT_AVG_QUEUE_SIZE,
         BLKIO_STAT_IDLE_TIME,
         BLKIO_STAT_EMPTY_TIME,
@@ -60,6 +54,18 @@ enum stat_type {
  #endif
  };
  
+/* Per cpu stats */
+enum stat_type_cpu {
+       BLKIO_STAT_CPU_SECTORS,
+       /* Total bytes transferred */
+       BLKIO_STAT_CPU_SERVICE_BYTES,
+       /* Total IOs serviced, post merge */
+       BLKIO_STAT_CPU_SERVICED,
+       /* Number of IOs merged */
+       BLKIO_STAT_CPU_MERGED,
+       BLKIO_STAT_CPU_NR
+};
+
  enum stat_sub_type {
         BLKIO_STAT_READ = 0,
         BLKIO_STAT_WRITE,
@@ -116,11 +122,11 @@ struct blkio_cgroup {
  struct blkio_group_stats {
         /* total disk time and nr sectors dispatched by this group */
         uint64_t time;
-       uint64_t sectors;
-       /* Time not charged to this cgroup */
-       uint64_t unaccounted_time;
         uint64_t stat_arr[BLKIO_STAT_QUEUED + 1][BLKIO_STAT_TOTAL];
  #ifdef CONFIG_DEBUG_BLK_CGROUP
+       /* Time not charged to this cgroup */
+       uint64_t unaccounted_time;
+
         /* Sum of number of IOs queued across all samples */
         uint64_t avg_queue_size_sum;
         /* Count of samples taken for average */
@@ -145,6 +151,13 @@ struct blkio_group_stats {
  #endif
  };
  
+/* Per cpu blkio group stats */
+struct blkio_group_stats_cpu {
+       uint64_t sectors;
+       uint64_t stat_arr_cpu[BLKIO_STAT_CPU_NR][BLKIO_STAT_TOTAL];
+       struct u64_stats_sync syncp;
+};
+
  struct blkio_group {
         /* An rcu protected unique identifier for the group */
         void *key;
@@ -160,6 +173,8 @@ struct blkio_group {
         /* Need to serialize the stats in the case of reset/update */
         spinlock_t stats_lock;
         struct blkio_group_stats stats;
+       /* Per cpu stats pointer */
+       struct blkio_group_stats_cpu __percpu *stats_cpu;
  };
  
  struct blkio_policy_node {
@@ -295,6 +310,7 @@ extern struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk);
  extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
         struct blkio_group *blkg, void *key, dev_t dev,
         enum blkio_policy_id plid);
+extern int blkio_alloc_blkg_stats(struct blkio_group *blkg);
  extern int blkiocg_del_blkio_group(struct blkio_group *blkg);
  extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg,
                                                 void *key);
@@ -322,6 +338,8 @@ static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
                 struct blkio_group *blkg, void *key, dev_t dev,
                 enum blkio_policy_id plid) {}
  
+static inline int blkio_alloc_blkg_stats(struct blkio_group *blkg) { return 0; }
+
  static inline int
  blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; }
  
diff --git a/block/blk-core.c b/block/blk-core.c

index 3fe00a1..c8303e9 100644 (file)
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -569,8 +569,6 @@ int blk_get_queue(struct request_queue *q)
  
  static inline void blk_free_request(struct request_queue *q, struct request *rq)
  {
-       BUG_ON(rq->cmd_flags & REQ_ON_PLUG);
-
         if (rq->cmd_flags & REQ_ELVPRIV)
                 elv_put_request(q, rq);
         mempool_free(rq, q->rq.rq_pool);
@@ -1110,14 +1108,6 @@ static bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
  {
         const int ff = bio->bi_rw & REQ_FAILFAST_MASK;
  
-       /*
-        * Debug stuff, kill later
-        */
-       if (!rq_mergeable(req)) {
-               blk_dump_rq_flags(req, "back");
-               return false;
-       }
-
         if (!ll_back_merge_fn(q, req, bio))
                 return false;
  
@@ -1132,6 +1122,7 @@ static bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
         req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
  
         drive_stat_acct(req, 0);
+       elv_bio_merged(q, req, bio);
         return true;
  }
  
@@ -1141,14 +1132,6 @@ static bool bio_attempt_front_merge(struct request_queue *q,
         const int ff = bio->bi_rw & REQ_FAILFAST_MASK;
         sector_t sector;
  
-       /*
-        * Debug stuff, kill later
-        */
-       if (!rq_mergeable(req)) {
-               blk_dump_rq_flags(req, "front");
-               return false;
-       }
-
         if (!ll_front_merge_fn(q, req, bio))
                 return false;
  
@@ -1173,6 +1156,7 @@ static bool bio_attempt_front_merge(struct request_queue *q,
         req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
  
         drive_stat_acct(req, 0);
+       elv_bio_merged(q, req, bio);
         return true;
  }
  
@@ -1258,14 +1242,12 @@ static int __make_request(struct request_queue *q, struct bio *bio)
  
         el_ret = elv_merge(q, &req, bio);
         if (el_ret == ELEVATOR_BACK_MERGE) {
-               BUG_ON(req->cmd_flags & REQ_ON_PLUG);
                 if (bio_attempt_back_merge(q, req, bio)) {
                         if (!attempt_back_merge(q, req))
                                 elv_merged_request(q, req, el_ret);
                         goto out_unlock;
                 }
         } else if (el_ret == ELEVATOR_FRONT_MERGE) {
-               BUG_ON(req->cmd_flags & REQ_ON_PLUG);
                 if (bio_attempt_front_merge(q, req, bio)) {
                         if (!attempt_front_merge(q, req))
                                 elv_merged_request(q, req, el_ret);
@@ -1320,10 +1302,6 @@ get_rq:
                         if (__rq->q != q)
                                 plug->should_sort = 1;
                 }
-               /*
-                * Debug flag, kill later
-                */
-               req->cmd_flags |= REQ_ON_PLUG;
                 list_add_tail(&req->queuelist, &plug->list);
                 drive_stat_acct(req, 1);
         } else {
@@ -1550,7 +1528,8 @@ static inline void __generic_make_request(struct bio *bio)
                         goto end_io;
                 }
  
-               blk_throtl_bio(q, &bio);
+               if (blk_throtl_bio(q, &bio))
+                       goto end_io;
  
                 /*
                  * If bio = NULL, bio has been throttled and will be submitted
@@ -2748,7 +2727,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
         while (!list_empty(&list)) {
                 rq = list_entry_rq(list.next);
                 list_del_init(&rq->queuelist);
-               BUG_ON(!(rq->cmd_flags & REQ_ON_PLUG));
                 BUG_ON(!rq->q);
                 if (rq->q != q) {
                         /*
@@ -2760,8 +2738,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
                         depth = 0;
                         spin_lock(q->queue_lock);
                 }
-               rq->cmd_flags &= ~REQ_ON_PLUG;
-
                 /*
                  * rq is already accounted, so use raw insert
                  */
diff --git a/block/blk-exec.c b/block/blk-exec.c

index 81e3181..8a0e7ec 100644 (file)
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -56,7 +56,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
         spin_lock_irq(q->queue_lock);
         __elv_add_request(q, rq, where);
         __blk_run_queue(q);
-       /* the queue is stopped so it won't be plugged+unplugged */
+       /* the queue is stopped so it won't be run */
         if (rq->cmd_type == REQ_TYPE_PM_RESUME)
                 q->request_fn(q);
         spin_unlock_irq(q->queue_lock);
diff --git a/block/blk-flush.c b/block/blk-flush.c

index 6c9b5e1..bb21e4c 100644 (file)
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -212,13 +212,19 @@ static void flush_end_io(struct request *flush_rq, int error)
         }
  
         /*
-        * Moving a request silently to empty queue_head may stall the
-        * queue.  Kick the queue in those cases.  This function is called
-        * from request completion path and calling directly into
-        * request_fn may confuse the driver.  Always use kblockd.
+        * Kick the queue to avoid stall for two cases:
+        * 1. Moving a request silently to empty queue_head may stall the
+        * queue.
+        * 2. When flush request is running in non-queueable queue, the
+        * queue is held. Restart the queue after flush request is finished
+        * to avoid stall.
+        * This function is called from request completion path and calling
+        * directly into request_fn may confuse the driver.  Always use
+        * kblockd.
          */
-       if (queued)
+       if (queued || q->flush_queue_delayed)
                 blk_run_queue_async(q);
+       q->flush_queue_delayed = 0;
  }
  
  /**
diff --git a/block/blk-ioc.c b/block/blk-ioc.c

index b791022..c898049 100644 (file)
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -96,6 +96,9 @@ struct io_context *alloc_io_context(gfp_t gfp_flags, int node)
                 INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH);
                 INIT_HLIST_HEAD(&ret->cic_list);
                 ret->ioc_data = NULL;
+#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE)
+               ret->cgroup_changed = 0;
+#endif
         }
  
         return ret;
diff --git a/block/blk-lib.c b/block/blk-lib.c

index 25de73e..78e627e 100644 (file)
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -9,17 +9,20 @@
  
  #include "blk.h"
  
-static void blkdev_discard_end_io(struct bio *bio, int err)
-{
-       if (err) {
-               if (err == -EOPNOTSUPP)
-                       set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
-               clear_bit(BIO_UPTODATE, &bio->bi_flags);
-       }
+struct bio_batch {
+       atomic_t                done;
+       unsigned long           flags;
+       struct completion       *wait;
+};
  
-       if (bio->bi_private)
-               complete(bio->bi_private);
+static void bio_batch_end_io(struct bio *bio, int err)
+{
+       struct bio_batch *bb = bio->bi_private;
  
+       if (err && (err != -EOPNOTSUPP))
+               clear_bit(BIO_UPTODATE, &bb->flags);
+       if (atomic_dec_and_test(&bb->done))
+               complete(bb->wait);
         bio_put(bio);
  }
  
@@ -41,6 +44,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
         struct request_queue *q = bdev_get_queue(bdev);
         int type = REQ_WRITE | REQ_DISCARD;
         unsigned int max_discard_sectors;
+       struct bio_batch bb;
         struct bio *bio;
         int ret = 0;
  
@@ -67,7 +71,11 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
                 type |= REQ_SECURE;
         }
  
-       while (nr_sects && !ret) {
+       atomic_set(&bb.done, 1);
+       bb.flags = 1 << BIO_UPTODATE;
+       bb.wait = &wait;
+
+       while (nr_sects) {
                 bio = bio_alloc(gfp_mask, 1);
                 if (!bio) {
                         ret = -ENOMEM;
@@ -75,9 +83,9 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
                 }
  
                 bio->bi_sector = sector;
-               bio->bi_end_io = blkdev_discard_end_io;
+               bio->bi_end_io = bio_batch_end_io;
                 bio->bi_bdev = bdev;
-               bio->bi_private = &wait;
+               bio->bi_private = &bb;
  
                 if (nr_sects > max_discard_sectors) {
                         bio->bi_size = max_discard_sectors << 9;
@@ -88,45 +96,21 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
                         nr_sects = 0;
                 }
  
-               bio_get(bio);
+               atomic_inc(&bb.done);
                 submit_bio(type, bio);
+       }
  
+       /* Wait for bios in-flight */
+       if (!atomic_dec_and_test(&bb.done))
                 wait_for_completion(&wait);
  
-               if (bio_flagged(bio, BIO_EOPNOTSUPP))
-                       ret = -EOPNOTSUPP;
-               else if (!bio_flagged(bio, BIO_UPTODATE))
-                       ret = -EIO;
-               bio_put(bio);
-       }
+       if (!test_bit(BIO_UPTODATE, &bb.flags))
+               ret = -EIO;
  
         return ret;
  }
  EXPORT_SYMBOL(blkdev_issue_discard);
  
-struct bio_batch
-{
-       atomic_t                done;
-       unsigned long           flags;
-       struct completion       *wait;
-};
-
-static void bio_batch_end_io(struct bio *bio, int err)
-{
-       struct bio_batch *bb = bio->bi_private;
-
-       if (err) {
-               if (err == -EOPNOTSUPP)
-                       set_bit(BIO_EOPNOTSUPP, &bb->flags);
-               else
-                       clear_bit(BIO_UPTODATE, &bb->flags);
-       }
-       if (bb)
-               if (atomic_dec_and_test(&bb->done))
-                       complete(bb->wait);
-       bio_put(bio);
-}
-
  /**
   * blkdev_issue_zeroout - generate number of zero filed write bios
   * @bdev:      blockdev to issue
@@ -151,7 +135,6 @@ int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
         bb.flags = 1 << BIO_UPTODATE;
         bb.wait = &wait;
  
-submit:
         ret = 0;
         while (nr_sects != 0) {
                 bio = bio_alloc(gfp_mask,
@@ -168,9 +151,6 @@ submit:
  
                 while (nr_sects != 0) {
                         sz = min((sector_t) PAGE_SIZE >> 9 , nr_sects);
-                       if (sz == 0)
-                               /* bio has maximum size possible */
-                               break;
                         ret = bio_add_page(bio, ZERO_PAGE(0), sz << 9, 0);
                         nr_sects -= ret >> 9;
                         sector += ret >> 9;
@@ -190,16 +170,6 @@ submit:
                 /* One of bios in the batch was completed with error.*/
                 ret = -EIO;
  
-       if (ret)
-               goto out;
-
-       if (test_bit(BIO_EOPNOTSUPP, &bb.flags)) {
-               ret = -EOPNOTSUPP;
-               goto out;
-       }
-       if (nr_sects != 0)
-               goto submit;
-out:
         return ret;
  }
  EXPORT_SYMBOL(blkdev_issue_zeroout);
diff --git a/block/blk-settings.c b/block/blk-settings.c

index 1fa7692..fa1eb04 100644 (file)
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -120,7 +120,7 @@ void blk_set_default_limits(struct queue_limits *lim)
         lim->discard_granularity = 0;
         lim->discard_alignment = 0;
         lim->discard_misaligned = 0;
-       lim->discard_zeroes_data = -1;
+       lim->discard_zeroes_data = 1;
         lim->logical_block_size = lim->physical_block_size = lim->io_min = 512;
         lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT);
         lim->alignment_offset = 0;
@@ -166,6 +166,7 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
  
         blk_set_default_limits(&q->limits);
         blk_queue_max_hw_sectors(q, BLK_SAFE_MAX_SECTORS);
+       q->limits.discard_zeroes_data = 0;
  
         /*
          * by default assume old behaviour and bounce for any highmem page
@@ -790,6 +791,12 @@ void blk_queue_flush(struct request_queue *q, unsigned int flush)
  }
  EXPORT_SYMBOL_GPL(blk_queue_flush);
  
+void blk_queue_flush_queueable(struct request_queue *q, bool queueable)
+{
+       q->flush_not_queueable = !queueable;
+}
+EXPORT_SYMBOL_GPL(blk_queue_flush_queueable);
+
  static int __init blk_settings_init(void)
  {
         blk_max_low_pfn = max_low_pfn - 1;
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c

index bd23631..d935bd8 100644 (file)
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -152,7 +152,8 @@ static ssize_t queue_discard_granularity_show(struct request_queue *q, char *pag
  
  static ssize_t queue_discard_max_show(struct request_queue *q, char *page)
  {
-       return queue_var_show(q->limits.max_discard_sectors << 9, page);
+       return sprintf(page, "%llu\n",
+                      (unsigned long long)q->limits.max_discard_sectors << 9);
  }
  
  static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *page)
diff --git a/block/blk-throttle.c b/block/blk-throttle.c

index 252a81a..a62be8d 100644 (file)
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -78,6 +78,8 @@ struct throtl_grp {
  
         /* Some throttle limits got updated for the group */
         int limits_changed;
+
+       struct rcu_head rcu_head;
  };
  
  struct throtl_data
@@ -88,7 +90,7 @@ struct throtl_data
         /* service tree for active throtl groups */
         struct throtl_rb_root tg_service_tree;
  
-       struct throtl_grp root_tg;
+       struct throtl_grp *root_tg;
         struct request_queue *queue;
  
         /* Total Number of queued bios on READ and WRITE lists */
@@ -151,56 +153,44 @@ static inline struct throtl_grp *throtl_ref_get_tg(struct throtl_grp *tg)
         return tg;
  }
  
-static void throtl_put_tg(struct throtl_grp *tg)
+static void throtl_free_tg(struct rcu_head *head)
  {
-       BUG_ON(atomic_read(&tg->ref) <= 0);
-       if (!atomic_dec_and_test(&tg->ref))
-               return;
+       struct throtl_grp *tg;
+
+       tg = container_of(head, struct throtl_grp, rcu_head);
+       free_percpu(tg->blkg.stats_cpu);
         kfree(tg);
  }
  
-static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td,
-                       struct blkio_cgroup *blkcg)
+static void throtl_put_tg(struct throtl_grp *tg)
  {
-       struct throtl_grp *tg = NULL;
-       void *key = td;
-       struct backing_dev_info *bdi = &td->queue->backing_dev_info;
-       unsigned int major, minor;
+       BUG_ON(atomic_read(&tg->ref) <= 0);
+       if (!atomic_dec_and_test(&tg->ref))
+               return;
  
         /*
-        * TODO: Speed up blkiocg_lookup_group() by maintaining a radix
-        * tree of blkg (instead of traversing through hash list all
-        * the time.
+        * A group is freed in rcu manner. But having an rcu lock does not
+        * mean that one can access all the fields of blkg and assume these
+        * are valid. For example, don't try to follow throtl_data and
+        * request queue links.
+        *
+        * Having a reference to blkg under an rcu allows access to only
+        * values local to groups like group stats and group rate limits
          */
+       call_rcu(&tg->rcu_head, throtl_free_tg);
+}
  
-       /*
-        * This is the common case when there are no blkio cgroups.
-        * Avoid lookup in this case
-        */
-       if (blkcg == &blkio_root_cgroup)
-               tg = &td->root_tg;
-       else
-               tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key));
-
-       /* Fill in device details for root group */
-       if (tg && !tg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
-               sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
-               tg->blkg.dev = MKDEV(major, minor);
-               goto done;
-       }
-
-       if (tg)
-               goto done;
-
-       tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node);
-       if (!tg)
-               goto done;
-
+static void throtl_init_group(struct throtl_grp *tg)
+{
         INIT_HLIST_NODE(&tg->tg_node);
         RB_CLEAR_NODE(&tg->rb_node);
         bio_list_init(&tg->bio_lists[0]);
         bio_list_init(&tg->bio_lists[1]);
-       td->limits_changed = false;
+       tg->limits_changed = false;
+
+       /* Practically unlimited BW */
+       tg->bps[0] = tg->bps[1] = -1;
+       tg->iops[0] = tg->iops[1] = -1;
  
         /*
          * Take the initial reference that will be released on destroy
@@ -209,33 +199,181 @@ static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td,
          * exit or cgroup deletion path depending on who is exiting first.
          */
         atomic_set(&tg->ref, 1);
+}
+
+/* Should be called with rcu read lock held (needed for blkcg) */
+static void
+throtl_add_group_to_td_list(struct throtl_data *td, struct throtl_grp *tg)
+{
+       hlist_add_head(&tg->tg_node, &td->tg_list);
+       td->nr_undestroyed_grps++;
+}
+
+static void
+__throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg)
+{
+       struct backing_dev_info *bdi = &td->queue->backing_dev_info;
+       unsigned int major, minor;
+
+       if (!tg || tg->blkg.dev)
+               return;
+
+       /*
+        * Fill in device details for a group which might not have been
+        * filled at group creation time as queue was being instantiated
+        * and driver had not attached a device yet
+        */
+       if (bdi->dev && dev_name(bdi->dev)) {
+               sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
+               tg->blkg.dev = MKDEV(major, minor);
+       }
+}
+
+/*
+ * Should be called without queue lock held. Here queue lock will be
+ * taken rarely. It will be taken only once during life time of a group
+ * if need be
+ */
+static void
+throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg)
+{
+       if (!tg || tg->blkg.dev)
+               return;
+
+       spin_lock_irq(td->queue->queue_lock);
+       __throtl_tg_fill_dev_details(td, tg);
+       spin_unlock_irq(td->queue->queue_lock);
+}
+
+static void throtl_init_add_tg_lists(struct throtl_data *td,
+                       struct throtl_grp *tg, struct blkio_cgroup *blkcg)
+{
+       __throtl_tg_fill_dev_details(td, tg);
  
         /* Add group onto cgroup list */
-       sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
         blkiocg_add_blkio_group(blkcg, &tg->blkg, (void *)td,
-                               MKDEV(major, minor), BLKIO_POLICY_THROTL);
+                               tg->blkg.dev, BLKIO_POLICY_THROTL);
  
         tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev);
         tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev);
         tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev);
         tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev);
  
-       hlist_add_head(&tg->tg_node, &td->tg_list);
-       td->nr_undestroyed_grps++;
-done:
+       throtl_add_group_to_td_list(td, tg);
+}
+
+/* Should be called without queue lock and outside of rcu period */
+static struct throtl_grp *throtl_alloc_tg(struct throtl_data *td)
+{
+       struct throtl_grp *tg = NULL;
+       int ret;
+
+       tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node);
+       if (!tg)
+               return NULL;
+
+       ret = blkio_alloc_blkg_stats(&tg->blkg);
+
+       if (ret) {
+               kfree(tg);
+               return NULL;
+       }
+
+       throtl_init_group(tg);
         return tg;
  }
  
-static struct throtl_grp * throtl_get_tg(struct throtl_data *td)
+static struct
+throtl_grp *throtl_find_tg(struct throtl_data *td, struct blkio_cgroup *blkcg)
  {
         struct throtl_grp *tg = NULL;
+       void *key = td;
+
+       /*
+        * This is the common case when there are no blkio cgroups.
+        * Avoid lookup in this case
+        */
+       if (blkcg == &blkio_root_cgroup)
+               tg = td->root_tg;
+       else
+               tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key));
+
+       __throtl_tg_fill_dev_details(td, tg);
+       return tg;
+}
+
+/*
+ * This function returns with the queue lock released in case of error,
+ * e.g. when the request queue is gone.
+ */
+static struct throtl_grp * throtl_get_tg(struct throtl_data *td)
+{
+       struct throtl_grp *tg = NULL, *__tg = NULL;
         struct blkio_cgroup *blkcg;
+       struct request_queue *q = td->queue;
  
         rcu_read_lock();
         blkcg = task_blkio_cgroup(current);
-       tg = throtl_find_alloc_tg(td, blkcg);
-       if (!tg)
-               tg = &td->root_tg;
+       tg = throtl_find_tg(td, blkcg);
+       if (tg) {
+               rcu_read_unlock();
+               return tg;
+       }
+
+       /*
+        * Need to allocate a group. Allocation of group also needs allocation
+        * of per cpu stats which in-turn takes a mutex() and can block. Hence
+        * we need to drop rcu lock and queue_lock before we call alloc
+        *
+        * Take the request queue reference to make sure queue does not
+        * go away once we return from allocation.
+        */
+       blk_get_queue(q);
+       rcu_read_unlock();
+       spin_unlock_irq(q->queue_lock);
+
+       tg = throtl_alloc_tg(td);
+       /*
+        * We might have slept in group allocation. Make sure queue is not
+        * dead
+        */
+       if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
+               blk_put_queue(q);
+               if (tg)
+                       kfree(tg);
+
+               return ERR_PTR(-ENODEV);
+       }
+       blk_put_queue(q);
+
+       /* Group allocated and queue is still alive. take the lock */
+       spin_lock_irq(q->queue_lock);
+
+       /*
+        * Initialize the new group. After sleeping, read the blkcg again.
+        */
+       rcu_read_lock();
+       blkcg = task_blkio_cgroup(current);
+
+       /*
+        * If some other thread already allocated the group while we were
+        * not holding queue lock, free up the group
+        */
+       __tg = throtl_find_tg(td, blkcg);
+
+       if (__tg) {
+               kfree(tg);
+               rcu_read_unlock();
+               return __tg;
+       }
+
+       /* Group allocation failed. Account the IO to root group */
+       if (!tg) {
+               tg = td->root_tg;
+               return tg;
+       }
+
+       throtl_init_add_tg_lists(td, tg, blkcg);
         rcu_read_unlock();
         return tg;
  }
@@ -544,6 +682,12 @@ static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg,
         return 0;
  }
  
+static bool tg_no_rule_group(struct throtl_grp *tg, bool rw) {
+       if (tg->bps[rw] == -1 && tg->iops[rw] == -1)
+               return 1;
+       return 0;
+}
+
  /*
   * Returns whether one can dispatch a bio or not. Also returns approx number
   * of jiffies to wait before this bio is with-in IO rate and can be dispatched
@@ -608,10 +752,6 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
         tg->bytes_disp[rw] += bio->bi_size;
         tg->io_disp[rw]++;
  
-       /*
-        * TODO: This will take blkg->stats_lock. Figure out a way
-        * to avoid this cost.
-        */
         blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync);
  }
  
@@ -989,15 +1129,51 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop)
         struct throtl_grp *tg;
         struct bio *bio = *biop;
         bool rw = bio_data_dir(bio), update_disptime = true;
+       struct blkio_cgroup *blkcg;
  
         if (bio->bi_rw & REQ_THROTTLED) {
                 bio->bi_rw &= ~REQ_THROTTLED;
                 return 0;
         }
  
+       /*
+        * A throtl_grp pointer retrieved under rcu can be used to access
+        * basic fields like stats and io rates. If a group has no rules,
+        * just update the dispatch stats in lockless manner and return.
+        */
+
+       rcu_read_lock();
+       blkcg = task_blkio_cgroup(current);
+       tg = throtl_find_tg(td, blkcg);
+       if (tg) {
+               throtl_tg_fill_dev_details(td, tg);
+
+               if (tg_no_rule_group(tg, rw)) {
+                       blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size,
+                                       rw, bio->bi_rw & REQ_SYNC);
+                       rcu_read_unlock();
+                       return 0;
+               }
+       }
+       rcu_read_unlock();
+
+       /*
+        * Either group has not been allocated yet or it is not an unlimited
+        * IO group
+        */
+
         spin_lock_irq(q->queue_lock);
         tg = throtl_get_tg(td);
  
+       if (IS_ERR(tg)) {
+               if (PTR_ERR(tg) == -ENODEV) {
+                       /*
+                        * Queue is gone. No queue lock held here.
+                        */
+                       return -ENODEV;
+               }
+       }
+
         if (tg->nr_queued[rw]) {
                 /*
                  * There is already another bio queued in same dir. No
@@ -1060,39 +1236,24 @@ int blk_throtl_init(struct request_queue *q)
         INIT_HLIST_HEAD(&td->tg_list);
         td->tg_service_tree = THROTL_RB_ROOT;
         td->limits_changed = false;
+       INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work);
  
-       /* Init root group */
-       tg = &td->root_tg;
-       INIT_HLIST_NODE(&tg->tg_node);
-       RB_CLEAR_NODE(&tg->rb_node);
-       bio_list_init(&tg->bio_lists[0]);
-       bio_list_init(&tg->bio_lists[1]);
-
-       /* Practically unlimited BW */
-       tg->bps[0] = tg->bps[1] = -1;
-       tg->iops[0] = tg->iops[1] = -1;
-       td->limits_changed = false;
+       /* alloc and Init root group. */
+       td->queue = q;
+       tg = throtl_alloc_tg(td);
  
-       /*
-        * Set root group reference to 2. One reference will be dropped when
-        * all groups on tg_list are being deleted during queue exit. Other
-        * reference will remain there as we don't want to delete this group
-        * as it is statically allocated and gets destroyed when throtl_data
-        * goes away.
-        */
-       atomic_set(&tg->ref, 2);
-       hlist_add_head(&tg->tg_node, &td->tg_list);
-       td->nr_undestroyed_grps++;
+       if (!tg) {
+               kfree(td);
+               return -ENOMEM;
+       }
  
-       INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work);
+       td->root_tg = tg;
  
         rcu_read_lock();
-       blkiocg_add_blkio_group(&blkio_root_cgroup, &tg->blkg, (void *)td,
-                                       0, BLKIO_POLICY_THROTL);
+       throtl_init_add_tg_lists(td, tg, &blkio_root_cgroup);
         rcu_read_unlock();
  
         /* Attach throtl data to request queue */
-       td->queue = q;
         q->td = td;
         return 0;
  }
diff --git a/block/blk.h b/block/blk.h

index 6126346..d658628 100644 (file)
--- a/block/blk.h
+++ b/block/blk.h
@@ -62,7 +62,28 @@ static inline struct request *__elv_next_request(struct request_queue *q)
                         return rq;
                 }
  
-               if (!q->elevator->ops->elevator_dispatch_fn(q, 0))
+               /*
+                * If a flush request is running and flush requests aren't
+                * queueable in the drive, we can hold the queue until the flush
+                * finishes. Even if we don't do this, the driver can't dispatch
+                * the next requests and will requeue them. This can improve
+                * throughput too. For example, we have requests flush1, write1,
+                * flush2. flush1 is dispatched, then the queue is held and
+                * write1 isn't inserted into the queue. After flush1 finishes,
+                * flush2 will be dispatched. Since the disk cache is already
+                * clean, flush2 will finish very soon, so it looks like flush2
+                * is folded into flush1.
+                * Since the queue is held, a flag is set to indicate the queue
+                * should be restarted later. Please see flush_end_io() for
+                * details.
+                */
+               if (q->flush_pending_idx != q->flush_running_idx &&
+                               !queue_flush_queueable(q)) {
+                       q->flush_queue_delayed = 1;
+                       return NULL;
+               }
+               if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags) ||
+                   !q->elevator->ops->elevator_dispatch_fn(q, 0))
                         return NULL;
         }
  }
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c

index ab7a9e6..7c52d68 100644 (file)
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -300,7 +300,9 @@ struct cfq_data {
  
         /* List of cfq groups being managed on this device*/
         struct hlist_head cfqg_list;
-       struct rcu_head rcu;
+
+       /* Number of groups which are on blkcg->blkg_list */
+       unsigned int nr_blkcg_linked_grps;
  };
  
  static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
@@ -665,15 +667,11 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2,
         if (rq2 == NULL)
                 return rq1;
  
-       if (rq_is_sync(rq1) && !rq_is_sync(rq2))
-               return rq1;
-       else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
-               return rq2;
-       if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
-               return rq1;
-       else if ((rq2->cmd_flags & REQ_META) &&
-                !(rq1->cmd_flags & REQ_META))
-               return rq2;
+       if (rq_is_sync(rq1) != rq_is_sync(rq2))
+               return rq_is_sync(rq1) ? rq1 : rq2;
+
+       if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_META)
+               return rq1->cmd_flags & REQ_META ? rq1 : rq2;
  
         s1 = blk_rq_pos(rq1);
         s2 = blk_rq_pos(rq2);
@@ -1014,28 +1012,47 @@ void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg,
         cfqg->needs_update = true;
  }
  
-static struct cfq_group * cfq_find_alloc_cfqg(struct cfq_data *cfqd,
-               struct blkio_cgroup *blkcg, int create)
+static void cfq_init_add_cfqg_lists(struct cfq_data *cfqd,
+                       struct cfq_group *cfqg, struct blkio_cgroup *blkcg)
  {
-       struct cfq_group *cfqg = NULL;
-       void *key = cfqd;
-       int i, j;
-       struct cfq_rb_root *st;
         struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
         unsigned int major, minor;
  
-       cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
-       if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
+       /*
+        * Add group onto cgroup list. It might happen that bdi->dev is
+        * not initialized yet. Initialize this new group without major
+        * and minor info and this info will be filled in once a new thread
+        * comes for IO.
+        */
+       if (bdi->dev) {
                 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
-               cfqg->blkg.dev = MKDEV(major, minor);
-               goto done;
-       }
-       if (cfqg || !create)
-               goto done;
+               cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg,
+                                       (void *)cfqd, MKDEV(major, minor));
+       } else
+               cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg,
+                                       (void *)cfqd, 0);
+
+       cfqd->nr_blkcg_linked_grps++;
+       cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
+
+       /* Add group on cfqd list */
+       hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
+}
+
+/*
+ * Should be called from sleepable context, without the request queue lock
+ * held: per-cpu stats are allocated dynamically and alloc_percpu needs to
+ * be called from sleepable context.
+ */
+static struct cfq_group * cfq_alloc_cfqg(struct cfq_data *cfqd)
+{
+       struct cfq_group *cfqg = NULL;
+       int i, j, ret;
+       struct cfq_rb_root *st;
  
         cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
         if (!cfqg)
-               goto done;
+               return NULL;
  
         for_each_cfqg_st(cfqg, i, j, st)
                 *st = CFQ_RB_ROOT;
@@ -1049,43 +1066,94 @@ static struct cfq_group * cfq_find_alloc_cfqg(struct cfq_data *cfqd,
          */
         cfqg->ref = 1;
  
+       ret = blkio_alloc_blkg_stats(&cfqg->blkg);
+       if (ret) {
+               kfree(cfqg);
+               return NULL;
+       }
+
+       return cfqg;
+}
+
+static struct cfq_group *
+cfq_find_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg)
+{
+       struct cfq_group *cfqg = NULL;
+       void *key = cfqd;
+       struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
+       unsigned int major, minor;
+
         /*
-        * Add group onto cgroup list. It might happen that bdi->dev is
-        * not initialized yet. Initialize this new group without major
-        * and minor info and this info will be filled in once a new thread
-        * comes for IO. See code above.
+        * This is the common case when there are no blkio cgroups.
+        * Avoid lookup in this case
          */
-       if (bdi->dev) {
-               sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
-               cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
-                                       MKDEV(major, minor));
-       } else
-               cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
-                                       0);
-
-       cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
+       if (blkcg == &blkio_root_cgroup)
+               cfqg = &cfqd->root_group;
+       else
+               cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
  
-       /* Add group on cfqd list */
-       hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
+       if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
+               sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
+               cfqg->blkg.dev = MKDEV(major, minor);
+       }
  
-done:
         return cfqg;
  }
  
  /*
- * Search for the cfq group current task belongs to. If create = 1, then also
- * create the cfq group if it does not exist. request_queue lock must be held.
+ * Search for the cfq group current task belongs to. request_queue lock must
+ * be held.
   */
-static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create)
+static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd)
  {
         struct blkio_cgroup *blkcg;
-       struct cfq_group *cfqg = NULL;
+       struct cfq_group *cfqg = NULL, *__cfqg = NULL;
+       struct request_queue *q = cfqd->queue;
  
         rcu_read_lock();
         blkcg = task_blkio_cgroup(current);
-       cfqg = cfq_find_alloc_cfqg(cfqd, blkcg, create);
-       if (!cfqg && create)
+       cfqg = cfq_find_cfqg(cfqd, blkcg);
+       if (cfqg) {
+               rcu_read_unlock();
+               return cfqg;
+       }
+
+       /*
+        * Need to allocate a group. Allocation of group also needs allocation
+        * of per cpu stats which in-turn takes a mutex() and can block. Hence
+        * we need to drop rcu lock and queue_lock before we call alloc.
+        *
+        * Not taking any queue reference here and assuming that queue is
+        * around by the time we return. CFQ queue allocation code does
+        * the same. It might be racy though.
+        */
+
+       rcu_read_unlock();
+       spin_unlock_irq(q->queue_lock);
+
+       cfqg = cfq_alloc_cfqg(cfqd);
+
+       spin_lock_irq(q->queue_lock);
+
+       rcu_read_lock();
+       blkcg = task_blkio_cgroup(current);
+
+       /*
+        * If some other thread already allocated the group while we were
+        * not holding queue lock, free up the group
+        */
+       __cfqg = cfq_find_cfqg(cfqd, blkcg);
+
+       if (__cfqg) {
+               kfree(cfqg);
+               rcu_read_unlock();
+               return __cfqg;
+       }
+
+       if (!cfqg)
                 cfqg = &cfqd->root_group;
+
+       cfq_init_add_cfqg_lists(cfqd, cfqg, blkcg);
         rcu_read_unlock();
         return cfqg;
  }
@@ -1118,6 +1186,7 @@ static void cfq_put_cfqg(struct cfq_group *cfqg)
                 return;
         for_each_cfqg_st(cfqg, i, j, st)
                 BUG_ON(!RB_EMPTY_ROOT(&st->rb));
+       free_percpu(cfqg->blkg.stats_cpu);
         kfree(cfqg);
  }
  
@@ -1176,7 +1245,7 @@ void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg)
  }
  
  #else /* GROUP_IOSCHED */
-static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create)
+static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd)
  {
         return &cfqd->root_group;
  }
@@ -1210,7 +1279,6 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
         struct cfq_rb_root *service_tree;
         int left;
         int new_cfqq = 1;
-       int group_changed = 0;
  
         service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq),
                                                 cfqq_type(cfqq));
@@ -1281,7 +1349,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
         rb_link_node(&cfqq->rb_node, parent, p);
         rb_insert_color(&cfqq->rb_node, &service_tree->rb);
         service_tree->count++;
-       if ((add_front || !new_cfqq) && !group_changed)
+       if (add_front || !new_cfqq)
                 return;
         cfq_group_notify_queue_add(cfqd, cfqq->cfqg);
  }
@@ -2029,7 +2097,7 @@ cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
  
         WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
  
-       return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio));
+       return 2 * base_rq * (IOPRIO_BE_NR - cfqq->ioprio);
  }
  
  /*
@@ -2911,7 +2979,7 @@ cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync,
         struct cfq_group *cfqg;
  
  retry:
-       cfqg = cfq_get_cfqg(cfqd, 1);
+       cfqg = cfq_get_cfqg(cfqd);
         cic = cfq_cic_lookup(cfqd, ioc);
         /* cic always exists here */
         cfqq = cic_to_cfqq(cic, is_sync);
@@ -3815,15 +3883,11 @@ static void cfq_put_async_queues(struct cfq_data *cfqd)
                 cfq_put_queue(cfqd->async_idle_cfqq);
  }
  
-static void cfq_cfqd_free(struct rcu_head *head)
-{
-       kfree(container_of(head, struct cfq_data, rcu));
-}
-
  static void cfq_exit_queue(struct elevator_queue *e)
  {
         struct cfq_data *cfqd = e->elevator_data;
         struct request_queue *q = cfqd->queue;
+       bool wait = false;
  
         cfq_shutdown_timer_wq(cfqd);
  
@@ -3842,7 +3906,13 @@ static void cfq_exit_queue(struct elevator_queue *e)
  
         cfq_put_async_queues(cfqd);
         cfq_release_cfq_groups(cfqd);
-       cfq_blkiocg_del_blkio_group(&cfqd->root_group.blkg);
+
+       /*
+        * If there are groups which we could not unlink from blkcg list,
+        * wait for a rcu period for them to be freed.
+        */
+       if (cfqd->nr_blkcg_linked_grps)
+               wait = true;
  
         spin_unlock_irq(q->queue_lock);
  
@@ -3852,8 +3922,25 @@ static void cfq_exit_queue(struct elevator_queue *e)
         ida_remove(&cic_index_ida, cfqd->cic_index);
         spin_unlock(&cic_index_lock);
  
-       /* Wait for cfqg->blkg->key accessors to exit their grace periods. */
-       call_rcu(&cfqd->rcu, cfq_cfqd_free);
+       /*
+        * Wait for cfqg->blkg->key accessors to exit their grace periods.
+        * Do this wait only if there are other unlinked groups out
+        * there. This can happen if cgroup deletion path claimed the
+        * responsibility of cleaning up a group before queue cleanup code
+        * gets to the group.
+        *
+        * Do not call synchronize_rcu() unconditionally as there are drivers
+        * which create/delete request queue hundreds of times during scan/boot
+        * and synchronize_rcu() can take significant time and slow down boot.
+        */
+       if (wait)
+               synchronize_rcu();
+
+#ifdef CONFIG_CFQ_GROUP_IOSCHED
+       /* Free up per cpu stats for root group */
+       free_percpu(cfqd->root_group.blkg.stats_cpu);
+#endif
+       kfree(cfqd);
  }
  
  static int cfq_alloc_cic_index(void)
@@ -3886,8 +3973,12 @@ static void *cfq_init_queue(struct request_queue *q)
                 return NULL;
  
         cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
-       if (!cfqd)
+       if (!cfqd) {
+               spin_lock(&cic_index_lock);
+               ida_remove(&cic_index_ida, i);
+               spin_unlock(&cic_index_lock);
                 return NULL;
+       }
  
         /*
          * Don't need take queue_lock in the routine, since we are
@@ -3909,14 +4000,29 @@ static void *cfq_init_queue(struct request_queue *q)
  
  #ifdef CONFIG_CFQ_GROUP_IOSCHED
         /*
-        * Take a reference to root group which we never drop. This is just
-        * to make sure that cfq_put_cfqg() does not try to kfree root group
+        * Set root group reference to 2. One reference will be dropped when
+        * all groups on cfqd->cfqg_list are being deleted during queue exit.
+        * The other reference will remain, as we don't want to delete this
+        * group: it is statically allocated and gets destroyed when
+        * cfq_data goes away.
          */
-       cfqg->ref = 1;
+       cfqg->ref = 2;
+
+       if (blkio_alloc_blkg_stats(&cfqg->blkg)) {
+               kfree(cfqg);
+               kfree(cfqd);
+               return NULL;
+       }
+
         rcu_read_lock();
+
         cfq_blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg,
                                         (void *)cfqd, 0);
         rcu_read_unlock();
+       cfqd->nr_blkcg_linked_grps++;
+
+       /* Add group on cfqd->cfqg_list */
+       hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
  #endif
         /*
          * Not strictly needed (since RB_ROOT just clears the node and we
diff --git a/block/elevator.c b/block/elevator.c

index 45ca1e3..b0b38ce 100644 (file)
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -155,13 +155,8 @@ static struct elevator_type *elevator_get(const char *name)
  
         e = elevator_find(name);
         if (!e) {
-               char elv[ELV_NAME_MAX + strlen("-iosched")];
-
                 spin_unlock(&elv_list_lock);
-
-               snprintf(elv, sizeof(elv), "%s-iosched", name);
-
-               request_module("%s", elv);
+               request_module("%s-iosched", name);
                 spin_lock(&elv_list_lock);
                 e = elevator_find(name);
         }
@@ -421,8 +416,6 @@ void elv_dispatch_sort(struct request_queue *q, struct request *rq)
         struct list_head *entry;
         int stop_flags;
  
-       BUG_ON(rq->cmd_flags & REQ_ON_PLUG);
-
         if (q->last_merge == rq)
                 q->last_merge = NULL;
  
@@ -661,8 +654,6 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where)
  
         rq->q = q;
  
-       BUG_ON(rq->cmd_flags & REQ_ON_PLUG);
-
         if (rq->cmd_flags & REQ_SOFTBARRIER) {
                 /* barriers are scheduling boundary, update end_sector */
                 if (rq->cmd_type == REQ_TYPE_FS ||
diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c

index 30ea95f..d51f979 100644 (file)
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -1089,21 +1089,21 @@ static int atapi_drain_needed(struct request *rq)
  static int ata_scsi_dev_config(struct scsi_device *sdev,
                                struct ata_device *dev)
  {
+       struct request_queue *q = sdev->request_queue;
+
         if (!ata_id_has_unload(dev->id))
                 dev->flags |= ATA_DFLAG_NO_UNLOAD;
  
         /* configure max sectors */
-       blk_queue_max_hw_sectors(sdev->request_queue, dev->max_sectors);
+       blk_queue_max_hw_sectors(q, dev->max_sectors);
  
         if (dev->class == ATA_DEV_ATAPI) {
-               struct request_queue *q = sdev->request_queue;
                 void *buf;
  
                 sdev->sector_size = ATA_SECT_SIZE;
  
                 /* set DMA padding */
-               blk_queue_update_dma_pad(sdev->request_queue,
-                                        ATA_DMA_PAD_SZ - 1);
+               blk_queue_update_dma_pad(q, ATA_DMA_PAD_SZ - 1);
  
                 /* configure draining */
                 buf = kmalloc(ATAPI_MAX_DRAIN, q->bounce_gfp | GFP_KERNEL);
@@ -1131,8 +1131,7 @@ static int ata_scsi_dev_config(struct scsi_device *sdev,
                         "sector_size=%u > PAGE_SIZE, PIO may malfunction\n",
                         sdev->sector_size);
  
-       blk_queue_update_dma_alignment(sdev->request_queue,
-                                      sdev->sector_size - 1);
+       blk_queue_update_dma_alignment(q, sdev->sector_size - 1);
  
         if (dev->flags & ATA_DFLAG_AN)
                 set_bit(SDEV_EVT_MEDIA_CHANGE, sdev->supported_events);
@@ -1145,6 +1144,8 @@ static int ata_scsi_dev_config(struct scsi_device *sdev,
                 scsi_adjust_queue_depth(sdev, MSG_SIMPLE_TAG, depth);
         }
  
+       blk_queue_flush_queueable(q, false);
+
         dev->sdev = sdev;
         return 0;
  }
diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c

index 8690e31..a0aabd9 100644 (file)
--- a/drivers/block/paride/pcd.c
+++ b/drivers/block/paride/pcd.c
@@ -320,6 +320,8 @@ static void pcd_init_units(void)
                 disk->first_minor = unit;
                 strcpy(disk->disk_name, cd->name);      /* umm... */
                 disk->fops = &pcd_bdops;
+               disk->flags = GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE;
+               disk->events = DISK_EVENT_MEDIA_CHANGE;
         }
  }
  
diff --git a/drivers/cdrom/viocd.c b/drivers/cdrom/viocd.c

index e427fbe..ae15a4d 100644 (file)
--- a/drivers/cdrom/viocd.c
+++ b/drivers/cdrom/viocd.c
@@ -625,7 +625,9 @@ static int viocd_probe(struct vio_dev *vdev, const struct vio_device_id *id)
         blk_queue_max_hw_sectors(q, 4096 / 512);
         gendisk->queue = q;
         gendisk->fops = &viocd_fops;
-       gendisk->flags = GENHD_FL_CD|GENHD_FL_REMOVABLE;
+       gendisk->flags = GENHD_FL_CD | GENHD_FL_REMOVABLE |
+                        GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE;
+       gendisk->events = DISK_EVENT_MEDIA_CHANGE;
         set_capacity(gendisk, 0);
         gendisk->private_data = d;
         d->viocd_disk = gendisk;
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c

index a5ec5a7..6e5123b 100644 (file)
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -1781,7 +1781,8 @@ static int ide_cd_probe(ide_drive_t *drive)
  
         ide_cd_read_toc(drive, &sense);
         g->fops = &idecd_ops;
-       g->flags |= GENHD_FL_REMOVABLE;
+       g->flags |= GENHD_FL_REMOVABLE | GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE;
+       g->events = DISK_EVENT_MEDIA_CHANGE;
         add_disk(g);
         return 0;
  
diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c

index 95019c7..4778e27 100644 (file)
--- a/drivers/scsi/sr.c
+++ b/drivers/scsi/sr.c
@@ -636,7 +636,7 @@ static int sr_probe(struct device *dev)
         disk->first_minor = minor;
         sprintf(disk->disk_name, "sr%d", minor);
         disk->fops = &sr_bdops;
-       disk->flags = GENHD_FL_CD;
+       disk->flags = GENHD_FL_CD | GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE;
         disk->events = DISK_EVENT_MEDIA_CHANGE | DISK_EVENT_EJECT_REQUEST;
  
         blk_queue_rq_timeout(sdev->request_queue, SR_TIMEOUT);
diff --git a/fs/block_dev.c b/fs/block_dev.c

index bf9c7a7..1f2b199 100644 (file)
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1238,6 +1238,8 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
         res = __blkdev_get(bdev, mode, 0);
  
         if (whole) {
+               struct gendisk *disk = whole->bd_disk;
+
                 /* finish claiming */
                 mutex_lock(&bdev->bd_mutex);
                 spin_lock(&bdev_lock);
@@ -1264,15 +1266,16 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
                 spin_unlock(&bdev_lock);
  
                 /*
-                * Block event polling for write claims.  Any write
-                * holder makes the write_holder state stick until all
-                * are released.  This is good enough and tracking
-                * individual writeable reference is too fragile given
-                * the way @mode is used in blkdev_get/put().
+                * Block event polling for write claims if requested.  Any
+                * write holder makes the write_holder state stick until
+                * all are released.  This is good enough and tracking
+                * individual writeable reference is too fragile given the
+                * way @mode is used in blkdev_get/put().
                  */
-               if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder) {
+               if ((disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE) &&
+                   !res && (mode & FMODE_WRITE) && !bdev->bd_write_holder) {
                         bdev->bd_write_holder = true;
-                       disk_block_events(bdev->bd_disk);
+                       disk_block_events(disk);
                 }
  
                 mutex_unlock(&bdev->bd_mutex);
diff --git a/fs/partitions/check.c b/fs/partitions/check.c

index d545e97..8ed4d34 100644 (file)
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -255,7 +255,11 @@ ssize_t part_discard_alignment_show(struct device *dev,
                                    struct device_attribute *attr, char *buf)
  {
         struct hd_struct *p = dev_to_part(dev);
-       return sprintf(buf, "%u\n", p->discard_alignment);
+       struct gendisk *disk = dev_to_disk(dev);
+
+       return sprintf(buf, "%u\n",
+                       queue_limit_discard_alignment(&disk->queue->limits,
+                                                       p->start_sect));
  }
  
  ssize_t part_stat_show(struct device *dev,
@@ -449,8 +453,6 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
         p->start_sect = start;
         p->alignment_offset =
                 queue_limit_alignment_offset(&disk->queue->limits, start);
-       p->discard_alignment =
-               queue_limit_discard_alignment(&disk->queue->limits, start);
         p->nr_sects = len;
         p->partno = partno;
         p->policy = get_disk_ro(disk);
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h

index be50d9e..2a7cea5 100644 (file)
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -151,7 +151,6 @@ enum rq_flag_bits {
         __REQ_IO_STAT,          /* account I/O stat */
         __REQ_MIXED_MERGE,      /* merge of different types, fail separately */
         __REQ_SECURE,           /* secure discard (used with __REQ_DISCARD) */
-       __REQ_ON_PLUG,          /* on plug list */
         __REQ_NR_BITS,          /* stops here */
  };
  
@@ -192,6 +191,5 @@ enum rq_flag_bits {
  #define REQ_IO_STAT            (1 << __REQ_IO_STAT)
  #define REQ_MIXED_MERGE                (1 << __REQ_MIXED_MERGE)
  #define REQ_SECURE             (1 << __REQ_SECURE)
-#define REQ_ON_PLUG            (1 << __REQ_ON_PLUG)
  
  #endif /* __LINUX_BLK_TYPES_H */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h

index 2ad95fa..ae9091a 100644 (file)
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -257,7 +257,7 @@ struct queue_limits {
         unsigned char           misaligned;
         unsigned char           discard_misaligned;
         unsigned char           cluster;
-       signed char             discard_zeroes_data;
+       unsigned char           discard_zeroes_data;
  };
  
  struct request_queue
@@ -364,6 +364,8 @@ struct request_queue
          * for flush operations
          */
         unsigned int            flush_flags;
+       unsigned int            flush_not_queueable:1;
+       unsigned int            flush_queue_delayed:1;
         unsigned int            flush_pending_idx:1;
         unsigned int            flush_running_idx:1;
         unsigned long           flush_pending_since;
@@ -843,6 +845,7 @@ extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *);
  extern void blk_queue_rq_timed_out(struct request_queue *, rq_timed_out_fn *);
  extern void blk_queue_rq_timeout(struct request_queue *, unsigned int);
  extern void blk_queue_flush(struct request_queue *q, unsigned int flush);
+extern void blk_queue_flush_queueable(struct request_queue *q, bool queueable);
  extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
  
  extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *);
@@ -1066,13 +1069,16 @@ static inline int queue_limit_discard_alignment(struct queue_limits *lim, sector
  {
         unsigned int alignment = (sector << 9) & (lim->discard_granularity - 1);
  
+       if (!lim->max_discard_sectors)
+               return 0;
+
         return (lim->discard_granularity + lim->discard_alignment - alignment)
                 & (lim->discard_granularity - 1);
  }
  
  static inline unsigned int queue_discard_zeroes_data(struct request_queue *q)
  {
-       if (q->limits.discard_zeroes_data == 1)
+       if (q->limits.max_discard_sectors && q->limits.discard_zeroes_data == 1)
                 return 1;
  
         return 0;
@@ -1111,6 +1117,11 @@ static inline unsigned int block_size(struct block_device *bdev)
         return bdev->bd_block_size;
  }
  
+static inline bool queue_flush_queueable(struct request_queue *q)
+{
+       return !q->flush_not_queueable;
+}
+
  typedef struct {struct page *v;} Sector;
  
  unsigned char *read_dev_sector(struct block_device *, sector_t, Sector *);
diff --git a/include/linux/genhd.h b/include/linux/genhd.h

index d764a42..b78956b 100644 (file)
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -100,7 +100,6 @@ struct hd_struct {
         sector_t start_sect;
         sector_t nr_sects;
         sector_t alignment_offset;
-       unsigned int discard_alignment;
         struct device __dev;
         struct kobject *holder_dir;
         int policy, partno;
@@ -127,6 +126,7 @@ struct hd_struct {
  #define GENHD_FL_SUPPRESS_PARTITION_INFO       32
  #define GENHD_FL_EXT_DEVT                      64 /* allow extended devt */
  #define GENHD_FL_NATIVE_CAPACITY               128
+#define GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE    256
  
  enum {
         DISK_EVENT_MEDIA_CHANGE                 = 1 << 0, /* media changed */
diff --git a/mm/backing-dev.c b/mm/backing-dev.c

index befc875..f032e6e 100644 (file)
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -63,10 +63,10 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
         unsigned long background_thresh;
         unsigned long dirty_thresh;
         unsigned long bdi_thresh;
-       unsigned long nr_dirty, nr_io, nr_more_io, nr_wb;
+       unsigned long nr_dirty, nr_io, nr_more_io;
         struct inode *inode;
  
-       nr_wb = nr_dirty = nr_io = nr_more_io = 0;
+       nr_dirty = nr_io = nr_more_io = 0;
         spin_lock(&inode_wb_list_lock);
         list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
                 nr_dirty++;
author	Linus Torvalds <torvalds@linux-foundation.org>
	Wed, 25 May 2011 16:14:07 +0000 (09:14 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Wed, 25 May 2011 16:14:07 +0000 (09:14 -0700)
Documentation/ABI/testing/sysfs-block		patch \| blob \| history
block/blk-cgroup.c		patch \| blob \| history
block/blk-cgroup.h		patch \| blob \| history
block/blk-core.c		patch \| blob \| history
block/blk-exec.c		patch \| blob \| history
block/blk-flush.c		patch \| blob \| history
block/blk-ioc.c		patch \| blob \| history
block/blk-lib.c		patch \| blob \| history
block/blk-settings.c		patch \| blob \| history
block/blk-sysfs.c		patch \| blob \| history
block/blk-throttle.c		patch \| blob \| history
block/blk.h		patch \| blob \| history
block/cfq-iosched.c		patch \| blob \| history
block/elevator.c		patch \| blob \| history
drivers/ata/libata-scsi.c		patch \| blob \| history
drivers/block/paride/pcd.c		patch \| blob \| history
drivers/cdrom/viocd.c		patch \| blob \| history
drivers/ide/ide-cd.c		patch \| blob \| history
drivers/scsi/sr.c		patch \| blob \| history
fs/block_dev.c		patch \| blob \| history
fs/partitions/check.c		patch \| blob \| history
include/linux/blk_types.h		patch \| blob \| history
include/linux/blkdev.h		patch \| blob \| history
include/linux/genhd.h		patch \| blob \| history
mm/backing-dev.c		patch \| blob \| history