Merge branch 'for-linus' of git://git.kernel.dk/linux-block

author Linus Torvalds <torvalds@linux-foundation.org>

Sat, 25 Aug 2012 18:36:43 +0000 (11:36 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Sat, 25 Aug 2012 18:36:43 +0000 (11:36 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Sat, 25 Aug 2012 18:36:43 +0000 (11:36 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sat, 25 Aug 2012 18:36:43 +0000 (11:36 -0700)
diff --git a/Documentation/block/00-INDEX b/Documentation/block/00-INDEX

index d111e3b..d18ecd8 100644 (file)
--- a/Documentation/block/00-INDEX
+++ b/Documentation/block/00-INDEX
@@ -3,15 +3,21 @@
  biodoc.txt
         - Notes on the Generic Block Layer Rewrite in Linux 2.5
  capability.txt
-       - Generic Block Device Capability (/sys/block/<disk>/capability)
+       - Generic Block Device Capability (/sys/block/<device>/capability)
+cfq-iosched.txt
+       - CFQ IO scheduler tunables
+data-integrity.txt
+       - Block data integrity
  deadline-iosched.txt
         - Deadline IO scheduler tunables
  ioprio.txt
         - Block io priorities (in CFQ scheduler)
+queue-sysfs.txt
+       - Queue's sysfs entries
  request.txt
         - The members of struct request (in include/linux/blkdev.h)
  stat.txt
-       - Block layer statistics in /sys/block/<dev>/stat
+       - Block layer statistics in /sys/block/<device>/stat
  switching-sched.txt
         - Switching I/O schedulers at runtime
  writeback_cache_control.txt
diff --git a/Documentation/block/cfq-iosched.txt b/Documentation/block/cfq-iosched.txt

index 6d670f5..d89b4fe 100644 (file)
--- a/Documentation/block/cfq-iosched.txt
+++ b/Documentation/block/cfq-iosched.txt
@@ -1,3 +1,14 @@
+CFQ (Complete Fairness Queueing)
+===============================
+
+The main aim of CFQ scheduler is to provide a fair allocation of the disk
+I/O bandwidth for all the processes which request an I/O operation.
+
+CFQ maintains a per-process queue for the processes which request I/O
+operation (synchronous requests). In case of asynchronous requests, all the
+requests from all the processes are batched together according to their
+process's I/O priority.
+
  CFQ ioscheduler tunables
  ========================
  
@@ -25,6 +36,72 @@ there are multiple spindles behind single LUN (Host based hardware RAID
  controller or for storage arrays), setting slice_idle=0 might end up in better
  throughput and acceptable latencies.
  
+back_seek_max
+-------------
+This specifies, given in Kbytes, the maximum "distance" for backward seeking.
+The distance is the amount of space from the current head location to the
+sectors that are backward in terms of distance.
+
+This parameter allows the scheduler to anticipate requests in the "backward"
+direction and consider them as being the "next" if they are within this
+distance from the current head location.
+
+back_seek_penalty
+-----------------
+This parameter is used to compute the cost of backward seeking. If the
+backward distance of request is just 1/back_seek_penalty from a "front"
+request, then the seeking cost of two requests is considered equivalent.
+
+So scheduler will not bias toward one or the other request (otherwise scheduler
+will bias toward front request). Default value of back_seek_penalty is 2.
+
+fifo_expire_async
+-----------------
+This parameter is used to set the timeout of asynchronous requests. Default
+value of this is 248ms.
+
+fifo_expire_sync
+----------------
+This parameter is used to set the timeout of synchronous requests. Default
+value of this is 124ms. To favor synchronous requests over asynchronous
+ones, this value should be decreased relative to fifo_expire_async.
+
+slice_async
+-----------
+This parameter is same as of slice_sync but for asynchronous queue. The
+default value is 40ms.
+
+slice_async_rq
+--------------
+This parameter is used to limit the dispatching of asynchronous request to
+device request queue in queue's slice time. The maximum number of request that
+are allowed to be dispatched also depends upon the io priority. Default value
+for this is 2.
+
+slice_sync
+----------
+When a queue is selected for execution, the queue's IO requests are only
+executed for a certain amount of time(time_slice) before switching to another
+queue. This parameter is used to calculate the time slice of synchronous
+queue.
+
+time_slice is computed using the below equation:-
+time_slice = slice_sync + (slice_sync/5 * (4 - prio)). To increase the
+time_slice of synchronous queue, increase the value of slice_sync. Default
+value is 100ms.
+
+quantum
+-------
+This specifies the number of request dispatched to the device queue. In a
+queue's time slice, a request will not be dispatched if the number of request
+in the device exceeds this parameter. This parameter is used for synchronous
+request.
+
+In case of storage with several disks, this setting can limit the parallel
+processing of requests. Therefore, increasing the value can improve the
+performance although this can cause the latency of some I/O to increase due
+to a larger number of requests.
+
  CFQ IOPS Mode for group scheduling
  ===================================
  Basic CFQ design is to provide priority based time slices. Higher priority
diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt

index 6518a55..e54ac1d 100644 (file)
--- a/Documentation/block/queue-sysfs.txt
+++ b/Documentation/block/queue-sysfs.txt
@@ -9,20 +9,71 @@ These files are the ones found in the /sys/block/xxx/queue/ directory.
  Files denoted with a RO postfix are readonly and the RW postfix means
  read-write.
  
+add_random (RW)
+----------------
+This file allows to turn off the disk entropy contribution. Default
+value of this file is '1'(on).
+
+discard_granularity (RO)
+-----------------------
+This shows the size of internal allocation of the device in bytes, if
+reported by the device. A value of '0' means device does not support
+the discard functionality.
+
+discard_max_bytes (RO)
+----------------------
+Devices that support discard functionality may have internal limits on
+the number of bytes that can be trimmed or unmapped in a single operation.
+The discard_max_bytes parameter is set by the device driver to the maximum
+number of bytes that can be discarded in a single operation. Discard
+requests issued to the device must not exceed this limit. A discard_max_bytes
+value of 0 means that the device does not support discard functionality.
+
+discard_zeroes_data (RO)
+------------------------
+When read, this file will show if the discarded blocks are zeroed by the
+device or not. If its value is '1' the blocks are zeroed otherwise not.
+
  hw_sector_size (RO)
  -------------------
  This is the hardware sector size of the device, in bytes.
  
+iostats (RW)
+-------------
+This file is used to control (on/off) the iostats accounting of the
+disk.
+
+logical_block_size (RO)
+-----------------------
+This is the logical block size of the device, in bytes.
+
  max_hw_sectors_kb (RO)
  ----------------------
  This is the maximum number of kilobytes supported in a single data transfer.
  
+max_integrity_segments (RO)
+---------------------------
+When read, this file shows the max limit of integrity segments as
+set by block layer which a hardware controller can handle.
+
  max_sectors_kb (RW)
  -------------------
  This is the maximum number of kilobytes that the block layer will allow
  for a filesystem request. Must be smaller than or equal to the maximum
  size allowed by the hardware.
  
+max_segments (RO)
+-----------------
+Maximum number of segments of the device.
+
+max_segment_size (RO)
+---------------------
+Maximum segment size of the device.
+
+minimum_io_size (RO)
+--------------------
+This is the smallest preferred io size reported by the device.
+
  nomerges (RW)
  -------------
  This enables the user to disable the lookup logic involved with IO
@@ -45,11 +96,24 @@ per-block-cgroup request pool.  IOW, if there are N block cgroups,
  each request queue may have upto N request pools, each independently
  regulated by nr_requests.
  
+optimal_io_size (RO)
+--------------------
+This is the optimal io size reported by the device.
+
+physical_block_size (RO)
+------------------------
+This is the physical block size of device, in bytes.
+
  read_ahead_kb (RW)
  ------------------
  Maximum number of kilobytes to read-ahead for filesystems on this block
  device.
  
+rotational (RW)
+---------------
+This file is used to state if the device is of rotational type or
+non-rotational type.
+
  rq_affinity (RW)
  ----------------
  If this option is '1', the block layer will migrate request completions to the
diff --git a/block/blk-lib.c b/block/blk-lib.c

index 2b461b4..19cc761 100644 (file)
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -44,6 +44,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
         struct request_queue *q = bdev_get_queue(bdev);
         int type = REQ_WRITE | REQ_DISCARD;
         unsigned int max_discard_sectors;
+       unsigned int granularity, alignment, mask;
         struct bio_batch bb;
         struct bio *bio;
         int ret = 0;
@@ -54,18 +55,20 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
         if (!blk_queue_discard(q))
                 return -EOPNOTSUPP;
  
+       /* Zero-sector (unknown) and one-sector granularities are the same.  */
+       granularity = max(q->limits.discard_granularity >> 9, 1U);
+       mask = granularity - 1;
+       alignment = (bdev_discard_alignment(bdev) >> 9) & mask;
+
         /*
          * Ensure that max_discard_sectors is of the proper
-        * granularity
+        * granularity, so that requests stay aligned after a split.
          */
         max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9);
+       max_discard_sectors = round_down(max_discard_sectors, granularity);
         if (unlikely(!max_discard_sectors)) {
                 /* Avoid infinite loop below. Being cautious never hurts. */
                 return -EOPNOTSUPP;
-       } else if (q->limits.discard_granularity) {
-               unsigned int disc_sects = q->limits.discard_granularity >> 9;
-
-               max_discard_sectors &= ~(disc_sects - 1);
         }
  
         if (flags & BLKDEV_DISCARD_SECURE) {
@@ -79,25 +82,37 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
         bb.wait = &wait;
  
         while (nr_sects) {
+               unsigned int req_sects;
+               sector_t end_sect;
+
                 bio = bio_alloc(gfp_mask, 1);
                 if (!bio) {
                         ret = -ENOMEM;
                         break;
                 }
  
+               req_sects = min_t(sector_t, nr_sects, max_discard_sectors);
+
+               /*
+                * If splitting a request, and the next starting sector would be
+                * misaligned, stop the discard at the previous aligned sector.
+                */
+               end_sect = sector + req_sects;
+               if (req_sects < nr_sects && (end_sect & mask) != alignment) {
+                       end_sect =
+                               round_down(end_sect - alignment, granularity)
+                               + alignment;
+                       req_sects = end_sect - sector;
+               }
+
                 bio->bi_sector = sector;
                 bio->bi_end_io = bio_batch_end_io;
                 bio->bi_bdev = bdev;
                 bio->bi_private = &bb;
  
-               if (nr_sects > max_discard_sectors) {
-                       bio->bi_size = max_discard_sectors << 9;
-                       nr_sects -= max_discard_sectors;
-                       sector += max_discard_sectors;
-               } else {
-                       bio->bi_size = nr_sects << 9;
-                       nr_sects = 0;
-               }
+               bio->bi_size = req_sects << 9;
+               nr_sects -= req_sects;
+               sector = end_sect;
  
                 atomic_inc(&bb.done);
                 submit_bio(type, bio);
diff --git a/block/blk-merge.c b/block/blk-merge.c

index 160035f..e76279e 100644 (file)
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -110,6 +110,49 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
         return 0;
  }
  
+static void
+__blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec,
+                    struct scatterlist *sglist, struct bio_vec **bvprv,
+                    struct scatterlist **sg, int *nsegs, int *cluster)
+{
+
+       int nbytes = bvec->bv_len;
+
+       if (*bvprv && *cluster) {
+               if ((*sg)->length + nbytes > queue_max_segment_size(q))
+                       goto new_segment;
+
+               if (!BIOVEC_PHYS_MERGEABLE(*bvprv, bvec))
+                       goto new_segment;
+               if (!BIOVEC_SEG_BOUNDARY(q, *bvprv, bvec))
+                       goto new_segment;
+
+               (*sg)->length += nbytes;
+       } else {
+new_segment:
+               if (!*sg)
+                       *sg = sglist;
+               else {
+                       /*
+                        * If the driver previously mapped a shorter
+                        * list, we could see a termination bit
+                        * prematurely unless it fully inits the sg
+                        * table on each mapping. We KNOW that there
+                        * must be more entries here or the driver
+                        * would be buggy, so force clear the
+                        * termination bit to avoid doing a full
+                        * sg_init_table() in drivers for each command.
+                        */
+                       (*sg)->page_link &= ~0x02;
+                       *sg = sg_next(*sg);
+               }
+
+               sg_set_page(*sg, bvec->bv_page, nbytes, bvec->bv_offset);
+               (*nsegs)++;
+       }
+       *bvprv = bvec;
+}
+
  /*
   * map a request to scatterlist, return number of sg entries setup. Caller
   * must make sure sg can hold rq->nr_phys_segments entries
@@ -131,41 +174,8 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq,
         bvprv = NULL;
         sg = NULL;
         rq_for_each_segment(bvec, rq, iter) {
-               int nbytes = bvec->bv_len;
-
-               if (bvprv && cluster) {
-                       if (sg->length + nbytes > queue_max_segment_size(q))
-                               goto new_segment;
-
-                       if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec))
-                               goto new_segment;
-                       if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec))
-                               goto new_segment;
-
-                       sg->length += nbytes;
-               } else {
-new_segment:
-                       if (!sg)
-                               sg = sglist;
-                       else {
-                               /*
-                                * If the driver previously mapped a shorter
-                                * list, we could see a termination bit
-                                * prematurely unless it fully inits the sg
-                                * table on each mapping. We KNOW that there
-                                * must be more entries here or the driver
-                                * would be buggy, so force clear the
-                                * termination bit to avoid doing a full
-                                * sg_init_table() in drivers for each command.
-                                */
-                               sg->page_link &= ~0x02;
-                               sg = sg_next(sg);
-                       }
-
-                       sg_set_page(sg, bvec->bv_page, nbytes, bvec->bv_offset);
-                       nsegs++;
-               }
-               bvprv = bvec;
+               __blk_segment_map_sg(q, bvec, sglist, &bvprv, &sg,
+                                    &nsegs, &cluster);
         } /* segments in rq */
  
  
@@ -199,6 +209,43 @@ new_segment:
  }
  EXPORT_SYMBOL(blk_rq_map_sg);
  
+/**
+ * blk_bio_map_sg - map a bio to a scatterlist
+ * @q: request_queue in question
+ * @bio: bio being mapped
+ * @sglist: scatterlist being mapped
+ *
+ * Note:
+ *    Caller must make sure sg can hold bio->bi_phys_segments entries
+ *
+ * Will return the number of sg entries setup
+ */
+int blk_bio_map_sg(struct request_queue *q, struct bio *bio,
+                  struct scatterlist *sglist)
+{
+       struct bio_vec *bvec, *bvprv;
+       struct scatterlist *sg;
+       int nsegs, cluster;
+       unsigned long i;
+
+       nsegs = 0;
+       cluster = blk_queue_cluster(q);
+
+       bvprv = NULL;
+       sg = NULL;
+       bio_for_each_segment(bvec, bio, i) {
+               __blk_segment_map_sg(q, bvec, sglist, &bvprv, &sg,
+                                    &nsegs, &cluster);
+       } /* segments in bio */
+
+       if (sg)
+               sg_mark_end(sg);
+
+       BUG_ON(bio->bi_phys_segments && nsegs > bio->bi_phys_segments);
+       return nsegs;
+}
+EXPORT_SYMBOL(blk_bio_map_sg);
+
  static inline int ll_new_hw_segment(struct request_queue *q,
                                     struct request *req,
                                     struct bio *bio)
diff --git a/block/genhd.c b/block/genhd.c

index cac7366..d839723 100644 (file)
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -835,7 +835,7 @@ static void disk_seqf_stop(struct seq_file *seqf, void *v)
  
  static void *show_partition_start(struct seq_file *seqf, loff_t *pos)
  {
-       static void *p;
+       void *p;
  
         p = disk_seqf_start(seqf, pos);
         if (!IS_ERR_OR_NULL(p) && !*pos)
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c

index ba91b40..d845664 100644 (file)
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -889,6 +889,7 @@ struct bm_aio_ctx {
         unsigned int done;
         unsigned flags;
  #define BM_AIO_COPY_PAGES      1
+#define BM_WRITE_ALL_PAGES     2
         int error;
         struct kref kref;
  };
@@ -1059,7 +1060,8 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned flags, unsigned lazy_w
                 if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx)
                         break;
                 if (rw & WRITE) {
-                       if (bm_test_page_unchanged(b->bm_pages[i])) {
+                       if (!(flags & BM_WRITE_ALL_PAGES) &&
+                           bm_test_page_unchanged(b->bm_pages[i])) {
                                 dynamic_dev_dbg(DEV, "skipped bm write for idx %u\n", i);
                                 continue;
                         }
@@ -1141,6 +1143,17 @@ int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local)
  }
  
  /**
+ * drbd_bm_write_all() - Write the whole bitmap to its on disk location.
+ * @mdev:      DRBD device.
+ *
+ * Will write all pages.
+ */
+int drbd_bm_write_all(struct drbd_conf *mdev) __must_hold(local)
+{
+       return bm_rw(mdev, WRITE, BM_WRITE_ALL_PAGES, 0);
+}
+
+/**
   * drbd_bm_lazy_write_out() - Write bitmap pages 0 to @upper_idx-1, if they have changed.
   * @mdev:      DRBD device.
   * @upper_idx: 0: write all changed pages; +ve: page index to stop scanning for changed pages
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h

index b2ca143..b953cc7 100644 (file)
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -1469,6 +1469,7 @@ extern int  drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr);
  extern int  drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local);
  extern int  drbd_bm_read(struct drbd_conf *mdev) __must_hold(local);
  extern int  drbd_bm_write(struct drbd_conf *mdev) __must_hold(local);
+extern int drbd_bm_write_all(struct drbd_conf *mdev) __must_hold(local);
  extern int  drbd_bm_write_copy_pages(struct drbd_conf *mdev) __must_hold(local);
  extern unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev,
                 unsigned long al_enr);
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c

index dbe6135..f93a032 100644 (file)
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -79,6 +79,7 @@ static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
  static void md_sync_timer_fn(unsigned long data);
  static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
  static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused);
+static void _tl_clear(struct drbd_conf *mdev);
  
  MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
               "Lars Ellenberg <lars@linbit.com>");
@@ -432,19 +433,10 @@ static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
  
         /* Actions operating on the disk state, also want to work on
            requests that got barrier acked. */
-       switch (what) {
-       case fail_frozen_disk_io:
-       case restart_frozen_disk_io:
-               list_for_each_safe(le, tle, &mdev->barrier_acked_requests) {
-                       req = list_entry(le, struct drbd_request, tl_requests);
-                       _req_mod(req, what);
-               }
  
-       case connection_lost_while_pending:
-       case resend:
-               break;
-       default:
-               dev_err(DEV, "what = %d in _tl_restart()\n", what);
+       list_for_each_safe(le, tle, &mdev->barrier_acked_requests) {
+               req = list_entry(le, struct drbd_request, tl_requests);
+               _req_mod(req, what);
         }
  }
  
@@ -459,11 +451,16 @@ static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
   */
  void tl_clear(struct drbd_conf *mdev)
  {
+       spin_lock_irq(&mdev->req_lock);
+       _tl_clear(mdev);
+       spin_unlock_irq(&mdev->req_lock);
+}
+
+static void _tl_clear(struct drbd_conf *mdev)
+{
         struct list_head *le, *tle;
         struct drbd_request *r;
  
-       spin_lock_irq(&mdev->req_lock);
-
         _tl_restart(mdev, connection_lost_while_pending);
  
         /* we expect this list to be empty. */
@@ -482,7 +479,6 @@ void tl_clear(struct drbd_conf *mdev)
  
         memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));
  
-       spin_unlock_irq(&mdev->req_lock);
  }
  
  void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
@@ -1476,12 +1472,12 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
         if (ns.susp_fen) {
                 /* case1: The outdate peer handler is successful: */
                 if (os.pdsk > D_OUTDATED  && ns.pdsk <= D_OUTDATED) {
-                       tl_clear(mdev);
                         if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
                                 drbd_uuid_new_current(mdev);
                                 clear_bit(NEW_CUR_UUID, &mdev->flags);
                         }
                         spin_lock_irq(&mdev->req_lock);
+                       _tl_clear(mdev);
                         _drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL);
                         spin_unlock_irq(&mdev->req_lock);
                 }
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c

index fb9dce8..edb490a 100644 (file)
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -674,8 +674,8 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds
                          la_size_changed && md_moved ? "size changed and md moved" :
                          la_size_changed ? "size changed" : "md moved");
                 /* next line implicitly does drbd_suspend_io()+drbd_resume_io() */
-               err = drbd_bitmap_io(mdev, &drbd_bm_write,
-                               "size changed", BM_LOCKED_MASK);
+               err = drbd_bitmap_io(mdev, md_moved ? &drbd_bm_write_all : &drbd_bm_write,
+                                    "size changed", BM_LOCKED_MASK);
                 if (err) {
                         rv = dev_size_error;
                         goto out;
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c

index 910335c..01b2ac6 100644 (file)
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -695,6 +695,12 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
                 break;
  
         case resend:
+               /* Simply complete (local only) READs. */
+               if (!(req->rq_state & RQ_WRITE) && !req->w.cb) {
+                       _req_may_be_done(req, m);
+                       break;
+               }
+
                 /* If RQ_NET_OK is already set, we got a P_WRITE_ACK or P_RECV_ACK
                    before the connection loss (B&C only); only P_BARRIER_ACK was missing.
                    Trowing them out of the TL here by pretending we got a BARRIER_ACK
@@ -834,7 +840,15 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, uns
                 req->private_bio = NULL;
         }
         if (rw == WRITE) {
-               remote = 1;
+               /* Need to replicate writes.  Unless it is an empty flush,
+                * which is better mapped to a DRBD P_BARRIER packet,
+                * also for drbd wire protocol compatibility reasons. */
+               if (unlikely(size == 0)) {
+                       /* The only size==0 bios we expect are empty flushes. */
+                       D_ASSERT(bio->bi_rw & REQ_FLUSH);
+                       remote = 0;
+               } else
+                       remote = 1;
         } else {
                 /* READ || READA */
                 if (local) {
@@ -870,8 +884,11 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, uns
          * extent.  This waits for any resync activity in the corresponding
          * resync extent to finish, and, if necessary, pulls in the target
          * extent into the activity log, which involves further disk io because
-        * of transactional on-disk meta data updates. */
-       if (rw == WRITE && local && !test_bit(AL_SUSPENDED, &mdev->flags)) {
+        * of transactional on-disk meta data updates.
+        * Empty flushes don't need to go into the activity log, they can only
+        * flush data for pending writes which are already in there. */
+       if (rw == WRITE && local && size
+       && !test_bit(AL_SUSPENDED, &mdev->flags)) {
                 req->rq_state |= RQ_IN_ACT_LOG;
                 drbd_al_begin_io(mdev, sector);
         }
@@ -994,7 +1011,10 @@ allocate_barrier:
         if (rw == WRITE && _req_conflicts(req))
                 goto fail_conflicting;
  
-       list_add_tail(&req->tl_requests, &mdev->newest_tle->requests);
+       /* no point in adding empty flushes to the transfer log,
+        * they are mapped to drbd barriers already. */
+       if (likely(size!=0))
+               list_add_tail(&req->tl_requests, &mdev->newest_tle->requests);
  
         /* NOTE remote first: to get the concurrent write detection right,
          * we must register the request before start of local IO.  */
@@ -1014,6 +1034,14 @@ allocate_barrier:
             mdev->net_conf->on_congestion != OC_BLOCK && mdev->agreed_pro_version >= 96)
                 maybe_pull_ahead(mdev);
  
+       /* If this was a flush, queue a drbd barrier/start a new epoch.
+        * Unless the current epoch was empty anyways, or we are not currently
+        * replicating, in which case there is no point. */
+       if (unlikely(bio->bi_rw & REQ_FLUSH)
+               && mdev->newest_tle->n_writes
+               && drbd_should_do_remote(mdev->state))
+               queue_barrier(mdev);
+
         spin_unlock_irq(&mdev->req_lock);
         kfree(b); /* if someone else has beaten us to it... */
  
diff --git a/fs/bio.c b/fs/bio.c

index 5eaa70c..71072ab 100644 (file)
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -73,7 +73,7 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
  {
         unsigned int sz = sizeof(struct bio) + extra_size;
         struct kmem_cache *slab = NULL;
-       struct bio_slab *bslab;
+       struct bio_slab *bslab, *new_bio_slabs;
         unsigned int i, entry = -1;
  
         mutex_lock(&bio_slab_lock);
@@ -97,11 +97,12 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
  
         if (bio_slab_nr == bio_slab_max && entry == -1) {
                 bio_slab_max <<= 1;
-               bio_slabs = krealloc(bio_slabs,
-                                    bio_slab_max * sizeof(struct bio_slab),
-                                    GFP_KERNEL);
-               if (!bio_slabs)
+               new_bio_slabs = krealloc(bio_slabs,
+                                        bio_slab_max * sizeof(struct bio_slab),
+                                        GFP_KERNEL);
+               if (!new_bio_slabs)
                         goto out_unlock;
+               bio_slabs = new_bio_slabs;
         }
         if (entry == -1)
                 entry = bio_slab_nr++;
diff --git a/fs/block_dev.c b/fs/block_dev.c

index 1e51919..38e721b 100644 (file)
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1578,10 +1578,12 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
                          unsigned long nr_segs, loff_t pos)
  {
         struct file *file = iocb->ki_filp;
+       struct blk_plug plug;
         ssize_t ret;
  
         BUG_ON(iocb->ki_pos != pos);
  
+       blk_start_plug(&plug);
         ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
         if (ret > 0 || ret == -EIOCBQUEUED) {
                 ssize_t err;
@@ -1590,6 +1592,7 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
                 if (err < 0 && ret > 0)
                         ret = err;
         }
+       blk_finish_plug(&plug);
         return ret;
  }
  EXPORT_SYMBOL_GPL(blkdev_aio_write);
diff --git a/fs/buffer.c b/fs/buffer.c

index 9f6d2e4..58e2e7b 100644 (file)
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -914,7 +914,7 @@ link_dev_buffers(struct page *page, struct buffer_head *head)
  /*
   * Initialise the state of a blockdev page's buffers.
   */ 
-static void
+static sector_t
  init_page_buffers(struct page *page, struct block_device *bdev,
                         sector_t block, int size)
  {
@@ -936,33 +936,41 @@ init_page_buffers(struct page *page, struct block_device *bdev,
                 block++;
                 bh = bh->b_this_page;
         } while (bh != head);
+
+       /*
+        * Caller needs to validate requested block against end of device.
+        */
+       return end_block;
  }
  
  /*
   * Create the page-cache page that contains the requested block.
   *
- * This is user purely for blockdev mappings.
+ * This is used purely for blockdev mappings.
   */
-static struct page *
+static int
  grow_dev_page(struct block_device *bdev, sector_t block,
-               pgoff_t index, int size)
+               pgoff_t index, int size, int sizebits)
  {
         struct inode *inode = bdev->bd_inode;
         struct page *page;
         struct buffer_head *bh;
+       sector_t end_block;
+       int ret = 0;            /* Will call free_more_memory() */
  
         page = find_or_create_page(inode->i_mapping, index,
                 (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE);
         if (!page)
-               return NULL;
+               return ret;
  
         BUG_ON(!PageLocked(page));
  
         if (page_has_buffers(page)) {
                 bh = page_buffers(page);
                 if (bh->b_size == size) {
-                       init_page_buffers(page, bdev, block, size);
-                       return page;
+                       end_block = init_page_buffers(page, bdev,
+                                               index << sizebits, size);
+                       goto done;
                 }
                 if (!try_to_free_buffers(page))
                         goto failed;
@@ -982,14 +990,14 @@ grow_dev_page(struct block_device *bdev, sector_t block,
          */
         spin_lock(&inode->i_mapping->private_lock);
         link_dev_buffers(page, bh);
-       init_page_buffers(page, bdev, block, size);
+       end_block = init_page_buffers(page, bdev, index << sizebits, size);
         spin_unlock(&inode->i_mapping->private_lock);
-       return page;
-
+done:
+       ret = (block < end_block) ? 1 : -ENXIO;
  failed:
         unlock_page(page);
         page_cache_release(page);
-       return NULL;
+       return ret;
  }
  
  /*
@@ -999,7 +1007,6 @@ failed:
  static int
  grow_buffers(struct block_device *bdev, sector_t block, int size)
  {
-       struct page *page;
         pgoff_t index;
         int sizebits;
  
@@ -1023,22 +1030,14 @@ grow_buffers(struct block_device *bdev, sector_t block, int size)
                         bdevname(bdev, b));
                 return -EIO;
         }
-       block = index << sizebits;
+
         /* Create a page with the proper size buffers.. */
-       page = grow_dev_page(bdev, block, index, size);
-       if (!page)
-               return 0;
-       unlock_page(page);
-       page_cache_release(page);
-       return 1;
+       return grow_dev_page(bdev, block, index, size, sizebits);
  }
  
  static struct buffer_head *
  __getblk_slow(struct block_device *bdev, sector_t block, int size)
  {
-       int ret;
-       struct buffer_head *bh;
-
         /* Size must be multiple of hard sectorsize */
         if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
                         (size < 512 || size > PAGE_SIZE))) {
@@ -1051,21 +1050,20 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size)
                 return NULL;
         }
  
-retry:
-       bh = __find_get_block(bdev, block, size);
-       if (bh)
-               return bh;
+       for (;;) {
+               struct buffer_head *bh;
+               int ret;
  
-       ret = grow_buffers(bdev, block, size);
-       if (ret == 0) {
-               free_more_memory();
-               goto retry;
-       } else if (ret > 0) {
                 bh = __find_get_block(bdev, block, size);
                 if (bh)
                         return bh;
+
+               ret = grow_buffers(bdev, block, size);
+               if (ret < 0)
+                       return NULL;
+               if (ret == 0)
+                       free_more_memory();
         }
-       return NULL;
  }
  
  /*
@@ -1321,10 +1319,6 @@ EXPORT_SYMBOL(__find_get_block);
   * which corresponds to the passed block_device, block and size. The
   * returned buffer has its reference count incremented.
   *
- * __getblk() cannot fail - it just keeps trying.  If you pass it an
- * illegal block number, __getblk() will happily return a buffer_head
- * which represents the non-existent block.  Very weird.
- *
   * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
   * attempt is failing.  FIXME, perhaps?
   */
diff --git a/fs/direct-io.c b/fs/direct-io.c

index 1faf4cb..f86c720 100644 (file)
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1062,6 +1062,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
         unsigned long user_addr;
         size_t bytes;
         struct buffer_head map_bh = { 0, };
+       struct blk_plug plug;
  
         if (rw & WRITE)
                 rw = WRITE_ODIRECT;
@@ -1177,6 +1178,8 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
                                 PAGE_SIZE - user_addr / PAGE_SIZE);
         }
  
+       blk_start_plug(&plug);
+
         for (seg = 0; seg < nr_segs; seg++) {
                 user_addr = (unsigned long)iov[seg].iov_base;
                 sdio.size += bytes = iov[seg].iov_len;
@@ -1235,6 +1238,8 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
         if (sdio.bio)
                 dio_bio_submit(dio, &sdio);
  
+       blk_finish_plug(&plug);
+
         /*
          * It is possible that, we return short IO due to end of file.
          * In that case, we need to release all the pages we got hold on.
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h

index 4e72a9d..4a2ab7c 100644 (file)
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -601,7 +601,7 @@ static inline void blk_clear_rl_full(struct request_list *rl, bool sync)
   * it already be started by driver.
   */
  #define RQ_NOMERGE_FLAGS       \
-       (REQ_NOMERGE | REQ_STARTED | REQ_SOFTBARRIER | REQ_FLUSH | REQ_FUA)
+       (REQ_NOMERGE | REQ_STARTED | REQ_SOFTBARRIER | REQ_FLUSH | REQ_FUA | REQ_DISCARD)
  #define rq_mergeable(rq)       \
         (!((rq)->cmd_flags & RQ_NOMERGE_FLAGS) && \
          (((rq)->cmd_flags & REQ_DISCARD) || \
@@ -894,6 +894,8 @@ extern void blk_queue_flush_queueable(struct request_queue *q, bool queueable);
  extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
  
  extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *);
+extern int blk_bio_map_sg(struct request_queue *q, struct bio *bio,
+                         struct scatterlist *sglist);
  extern void blk_dump_rq_flags(struct request *, char *);
  extern long nr_blockdev_pages(void);
  
@@ -1139,6 +1141,16 @@ static inline int queue_limit_discard_alignment(struct queue_limits *lim, sector
                 & (lim->discard_granularity - 1);
  }
  
+static inline int bdev_discard_alignment(struct block_device *bdev)
+{
+       struct request_queue *q = bdev_get_queue(bdev);
+
+       if (bdev != bdev->bd_contains)
+               return bdev->bd_part->discard_alignment;
+
+       return q->limits.discard_alignment;
+}
+
  static inline unsigned int queue_discard_zeroes_data(struct request_queue *q)
  {
         if (q->limits.max_discard_sectors && q->limits.discard_zeroes_data == 1)
diff --git a/mm/filemap.c b/mm/filemap.c

index fa5ca30..3843445 100644 (file)
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1412,12 +1412,8 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
                         retval = filemap_write_and_wait_range(mapping, pos,
                                         pos + iov_length(iov, nr_segs) - 1);
                         if (!retval) {
-                               struct blk_plug plug;
-
-                               blk_start_plug(&plug);
                                 retval = mapping->a_ops->direct_IO(READ, iocb,
                                                         iov, pos, nr_segs);
-                               blk_finish_plug(&plug);
                         }
                         if (retval > 0) {
                                 *ppos = pos + retval;
@@ -2527,14 +2523,12 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
  {
         struct file *file = iocb->ki_filp;
         struct inode *inode = file->f_mapping->host;
-       struct blk_plug plug;
         ssize_t ret;
  
         BUG_ON(iocb->ki_pos != pos);
  
         sb_start_write(inode->i_sb);
         mutex_lock(&inode->i_mutex);
-       blk_start_plug(&plug);
         ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
         mutex_unlock(&inode->i_mutex);
  
@@ -2545,7 +2539,6 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
                 if (err < 0 && ret > 0)
                         ret = err;
         }
-       blk_finish_plug(&plug);
         sb_end_write(inode->i_sb);
         return ret;
  }
author	Linus Torvalds <torvalds@linux-foundation.org>
	Sat, 25 Aug 2012 18:36:43 +0000 (11:36 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Sat, 25 Aug 2012 18:36:43 +0000 (11:36 -0700)
Documentation/block/00-INDEX		patch \| blob \| history
Documentation/block/cfq-iosched.txt		patch \| blob \| history
Documentation/block/queue-sysfs.txt		patch \| blob \| history
block/blk-lib.c		patch \| blob \| history
block/blk-merge.c		patch \| blob \| history
block/genhd.c		patch \| blob \| history
drivers/block/drbd/drbd_bitmap.c		patch \| blob \| history
drivers/block/drbd/drbd_int.h		patch \| blob \| history
drivers/block/drbd/drbd_main.c		patch \| blob \| history
drivers/block/drbd/drbd_nl.c		patch \| blob \| history
drivers/block/drbd/drbd_req.c		patch \| blob \| history
fs/bio.c		patch \| blob \| history
fs/block_dev.c		patch \| blob \| history
fs/buffer.c		patch \| blob \| history
fs/direct-io.c		patch \| blob \| history
include/linux/blkdev.h		patch \| blob \| history
mm/filemap.c		patch \| blob \| history