OSDN Git Service

Merge branch 'bcache-for-3.13' of git://evilpiepirate.org/~kent/linux-bcache into...
authorJens Axboe <axboe@kernel.dk>
Tue, 17 Dec 2013 19:54:03 +0000 (12:54 -0700)
committerJens Axboe <axboe@kernel.dk>
Tue, 17 Dec 2013 19:54:03 +0000 (12:54 -0700)
Kent writes:

Jens - small pile of bcache fixes. I've been slacking on the writeback
fixes but those definitely need to get into 3.13.

drivers/md/bcache/alloc.c
drivers/md/bcache/bcache.h
drivers/md/bcache/btree.c
drivers/md/bcache/movinggc.c
drivers/md/bcache/super.c
drivers/md/bcache/sysfs.c
drivers/md/bcache/util.c
drivers/md/bcache/util.h
drivers/md/bcache/writeback.c

index 2b46bf1..4c9852d 100644 (file)
@@ -421,9 +421,11 @@ out:
 
        if (watermark <= WATERMARK_METADATA) {
                SET_GC_MARK(b, GC_MARK_METADATA);
+               SET_GC_MOVE(b, 0);
                b->prio = BTREE_PRIO;
        } else {
                SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
+               SET_GC_MOVE(b, 0);
                b->prio = INITIAL_PRIO;
        }
 
index 4beb55a..754f431 100644 (file)
@@ -197,7 +197,7 @@ struct bucket {
        uint8_t         disk_gen;
        uint8_t         last_gc; /* Most out of date gen in the btree */
        uint8_t         gc_gen;
-       uint16_t        gc_mark;
+       uint16_t        gc_mark; /* Bitfield used by GC. See below for field */
 };
 
 /*
@@ -209,7 +209,8 @@ BITMASK(GC_MARK,     struct bucket, gc_mark, 0, 2);
 #define GC_MARK_RECLAIMABLE    0
 #define GC_MARK_DIRTY          1
 #define GC_MARK_METADATA       2
-BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, 14);
+BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, 13);
+BITMASK(GC_MOVE, struct bucket, gc_mark, 15, 1);
 
 #include "journal.h"
 #include "stats.h"
@@ -372,14 +373,14 @@ struct cached_dev {
        unsigned char           writeback_percent;
        unsigned                writeback_delay;
 
-       int                     writeback_rate_change;
-       int64_t                 writeback_rate_derivative;
        uint64_t                writeback_rate_target;
+       int64_t                 writeback_rate_proportional;
+       int64_t                 writeback_rate_derivative;
+       int64_t                 writeback_rate_change;
 
        unsigned                writeback_rate_update_seconds;
        unsigned                writeback_rate_d_term;
        unsigned                writeback_rate_p_term_inverse;
-       unsigned                writeback_rate_d_smooth;
 };
 
 enum alloc_watermarks {
@@ -445,7 +446,6 @@ struct cache {
         * call prio_write() to keep gens from wrapping.
         */
        uint8_t                 need_save_prio;
-       unsigned                gc_move_threshold;
 
        /*
         * If nonzero, we know we aren't going to find any buckets to invalidate
index 5e2765a..31bb53f 100644 (file)
@@ -1561,6 +1561,28 @@ size_t bch_btree_gc_finish(struct cache_set *c)
                SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i),
                            GC_MARK_METADATA);
 
+       /* don't reclaim buckets to which writeback keys point */
+       rcu_read_lock();
+       for (i = 0; i < c->nr_uuids; i++) {
+               struct bcache_device *d = c->devices[i];
+               struct cached_dev *dc;
+               struct keybuf_key *w, *n;
+               unsigned j;
+
+               if (!d || UUID_FLASH_ONLY(&c->uuids[i]))
+                       continue;
+               dc = container_of(d, struct cached_dev, disk);
+
+               spin_lock(&dc->writeback_keys.lock);
+               rbtree_postorder_for_each_entry_safe(w, n,
+                                       &dc->writeback_keys.keys, node)
+                       for (j = 0; j < KEY_PTRS(&w->key); j++)
+                               SET_GC_MARK(PTR_BUCKET(c, &w->key, j),
+                                           GC_MARK_DIRTY);
+               spin_unlock(&dc->writeback_keys.lock);
+       }
+       rcu_read_unlock();
+
        for_each_cache(ca, c, i) {
                uint64_t *i;
 
@@ -1817,7 +1839,8 @@ static bool fix_overlapping_extents(struct btree *b, struct bkey *insert,
                        if (KEY_START(k) > KEY_START(insert) + sectors_found)
                                goto check_failed;
 
-                       if (KEY_PTRS(replace_key) != KEY_PTRS(k))
+                       if (KEY_PTRS(k) != KEY_PTRS(replace_key) ||
+                           KEY_DIRTY(k) != KEY_DIRTY(replace_key))
                                goto check_failed;
 
                        /* skip past gen */
@@ -2217,7 +2240,7 @@ struct btree_insert_op {
        struct bkey     *replace_key;
 };
 
-int btree_insert_fn(struct btree_op *b_op, struct btree *b)
+static int btree_insert_fn(struct btree_op *b_op, struct btree *b)
 {
        struct btree_insert_op *op = container_of(b_op,
                                        struct btree_insert_op, op);
index 7c1275e..f2f0998 100644 (file)
@@ -25,10 +25,9 @@ static bool moving_pred(struct keybuf *buf, struct bkey *k)
        unsigned i;
 
        for (i = 0; i < KEY_PTRS(k); i++) {
-               struct cache *ca = PTR_CACHE(c, k, i);
                struct bucket *g = PTR_BUCKET(c, k, i);
 
-               if (GC_SECTORS_USED(g) < ca->gc_move_threshold)
+               if (GC_MOVE(g))
                        return true;
        }
 
@@ -65,11 +64,16 @@ static void write_moving_finish(struct closure *cl)
 
 static void read_moving_endio(struct bio *bio, int error)
 {
+       struct bbio *b = container_of(bio, struct bbio, bio);
        struct moving_io *io = container_of(bio->bi_private,
                                            struct moving_io, cl);
 
        if (error)
                io->op.error = error;
+       else if (!KEY_DIRTY(&b->key) &&
+                ptr_stale(io->op.c, &b->key, 0)) {
+               io->op.error = -EINTR;
+       }
 
        bch_bbio_endio(io->op.c, bio, error, "reading data to move");
 }
@@ -141,6 +145,11 @@ static void read_moving(struct cache_set *c)
                if (!w)
                        break;
 
+               if (ptr_stale(c, &w->key, 0)) {
+                       bch_keybuf_del(&c->moving_gc_keys, w);
+                       continue;
+               }
+
                io = kzalloc(sizeof(struct moving_io) + sizeof(struct bio_vec)
                             * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
                             GFP_KERNEL);
@@ -184,7 +193,8 @@ static bool bucket_cmp(struct bucket *l, struct bucket *r)
 
 static unsigned bucket_heap_top(struct cache *ca)
 {
-       return GC_SECTORS_USED(heap_peek(&ca->heap));
+       struct bucket *b;
+       return (b = heap_peek(&ca->heap)) ? GC_SECTORS_USED(b) : 0;
 }
 
 void bch_moving_gc(struct cache_set *c)
@@ -226,9 +236,8 @@ void bch_moving_gc(struct cache_set *c)
                        sectors_to_move -= GC_SECTORS_USED(b);
                }
 
-               ca->gc_move_threshold = bucket_heap_top(ca);
-
-               pr_debug("threshold %u", ca->gc_move_threshold);
+               while (heap_pop(&ca->heap, b, bucket_cmp))
+                       SET_GC_MOVE(b, 1);
        }
 
        mutex_unlock(&c->bucket_lock);
index dec15cd..c57bfa0 100644 (file)
@@ -1676,7 +1676,7 @@ err:
 static bool can_attach_cache(struct cache *ca, struct cache_set *c)
 {
        return ca->sb.block_size        == c->sb.block_size &&
-               ca->sb.bucket_size      == c->sb.block_size &&
+               ca->sb.bucket_size      == c->sb.bucket_size &&
                ca->sb.nr_in_set        == c->sb.nr_in_set;
 }
 
index 80d4c2b..a1f8561 100644 (file)
@@ -83,7 +83,6 @@ rw_attribute(writeback_rate);
 rw_attribute(writeback_rate_update_seconds);
 rw_attribute(writeback_rate_d_term);
 rw_attribute(writeback_rate_p_term_inverse);
-rw_attribute(writeback_rate_d_smooth);
 read_attribute(writeback_rate_debug);
 
 read_attribute(stripe_size);
@@ -129,31 +128,41 @@ SHOW(__bch_cached_dev)
        var_printf(writeback_running,   "%i");
        var_print(writeback_delay);
        var_print(writeback_percent);
-       sysfs_print(writeback_rate,     dc->writeback_rate.rate);
+       sysfs_hprint(writeback_rate,    dc->writeback_rate.rate << 9);
 
        var_print(writeback_rate_update_seconds);
        var_print(writeback_rate_d_term);
        var_print(writeback_rate_p_term_inverse);
-       var_print(writeback_rate_d_smooth);
 
        if (attr == &sysfs_writeback_rate_debug) {
+               char rate[20];
                char dirty[20];
-               char derivative[20];
                char target[20];
-               bch_hprint(dirty,
-                          bcache_dev_sectors_dirty(&dc->disk) << 9);
-               bch_hprint(derivative,  dc->writeback_rate_derivative << 9);
+               char proportional[20];
+               char derivative[20];
+               char change[20];
+               s64 next_io;
+
+               bch_hprint(rate,        dc->writeback_rate.rate << 9);
+               bch_hprint(dirty,       bcache_dev_sectors_dirty(&dc->disk) << 9);
                bch_hprint(target,      dc->writeback_rate_target << 9);
+               bch_hprint(proportional,dc->writeback_rate_proportional << 9);
+               bch_hprint(derivative,  dc->writeback_rate_derivative << 9);
+               bch_hprint(change,      dc->writeback_rate_change << 9);
+
+               next_io = div64_s64(dc->writeback_rate.next - local_clock(),
+                                   NSEC_PER_MSEC);
 
                return sprintf(buf,
-                              "rate:\t\t%u\n"
-                              "change:\t\t%i\n"
+                              "rate:\t\t%s/sec\n"
                               "dirty:\t\t%s\n"
+                              "target:\t\t%s\n"
+                              "proportional:\t%s\n"
                               "derivative:\t%s\n"
-                              "target:\t\t%s\n",
-                              dc->writeback_rate.rate,
-                              dc->writeback_rate_change,
-                              dirty, derivative, target);
+                              "change:\t\t%s/sec\n"
+                              "next io:\t%llims\n",
+                              rate, dirty, target, proportional,
+                              derivative, change, next_io);
        }
 
        sysfs_hprint(dirty_data,
@@ -189,6 +198,7 @@ STORE(__cached_dev)
        struct kobj_uevent_env *env;
 
 #define d_strtoul(var)         sysfs_strtoul(var, dc->var)
+#define d_strtoul_nonzero(var) sysfs_strtoul_clamp(var, dc->var, 1, INT_MAX)
 #define d_strtoi_h(var)                sysfs_hatoi(var, dc->var)
 
        sysfs_strtoul(data_csum,        dc->disk.data_csum);
@@ -197,16 +207,15 @@ STORE(__cached_dev)
        d_strtoul(writeback_metadata);
        d_strtoul(writeback_running);
        d_strtoul(writeback_delay);
-       sysfs_strtoul_clamp(writeback_rate,
-                           dc->writeback_rate.rate, 1, 1000000);
+
        sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, 40);
 
-       d_strtoul(writeback_rate_update_seconds);
+       sysfs_strtoul_clamp(writeback_rate,
+                           dc->writeback_rate.rate, 1, INT_MAX);
+
+       d_strtoul_nonzero(writeback_rate_update_seconds);
        d_strtoul(writeback_rate_d_term);
-       d_strtoul(writeback_rate_p_term_inverse);
-       sysfs_strtoul_clamp(writeback_rate_p_term_inverse,
-                           dc->writeback_rate_p_term_inverse, 1, INT_MAX);
-       d_strtoul(writeback_rate_d_smooth);
+       d_strtoul_nonzero(writeback_rate_p_term_inverse);
 
        d_strtoi_h(sequential_cutoff);
        d_strtoi_h(readahead);
@@ -313,7 +322,6 @@ static struct attribute *bch_cached_dev_files[] = {
        &sysfs_writeback_rate_update_seconds,
        &sysfs_writeback_rate_d_term,
        &sysfs_writeback_rate_p_term_inverse,
-       &sysfs_writeback_rate_d_smooth,
        &sysfs_writeback_rate_debug,
        &sysfs_dirty_data,
        &sysfs_stripe_size,
index 462214e..bb37618 100644 (file)
@@ -209,7 +209,13 @@ uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done)
 {
        uint64_t now = local_clock();
 
-       d->next += div_u64(done, d->rate);
+       d->next += div_u64(done * NSEC_PER_SEC, d->rate);
+
+       if (time_before64(now + NSEC_PER_SEC, d->next))
+               d->next = now + NSEC_PER_SEC;
+
+       if (time_after64(now - NSEC_PER_SEC * 2, d->next))
+               d->next = now - NSEC_PER_SEC * 2;
 
        return time_after64(d->next, now)
                ? div_u64(d->next - now, NSEC_PER_SEC / HZ)
index 362c4b3..1030c60 100644 (file)
@@ -110,7 +110,7 @@ do {                                                                        \
        _r;                                                             \
 })
 
-#define heap_peek(h)   ((h)->size ? (h)->data[0] : NULL)
+#define heap_peek(h)   ((h)->used ? (h)->data[0] : NULL)
 
 #define heap_full(h)   ((h)->used == (h)->size)
 
index 99053b1..6c44fe0 100644 (file)
@@ -30,38 +30,40 @@ static void __update_writeback_rate(struct cached_dev *dc)
 
        /* PD controller */
 
-       int change = 0;
-       int64_t error;
        int64_t dirty = bcache_dev_sectors_dirty(&dc->disk);
        int64_t derivative = dirty - dc->disk.sectors_dirty_last;
+       int64_t proportional = dirty - target;
+       int64_t change;
 
        dc->disk.sectors_dirty_last = dirty;
 
-       derivative *= dc->writeback_rate_d_term;
-       derivative = clamp(derivative, -dirty, dirty);
+       /* Scale to sectors per second */
 
-       derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative,
-                             dc->writeback_rate_d_smooth, 0);
+       proportional *= dc->writeback_rate_update_seconds;
+       proportional = div_s64(proportional, dc->writeback_rate_p_term_inverse);
 
-       /* Avoid divide by zero */
-       if (!target)
-               goto out;
+       derivative = div_s64(derivative, dc->writeback_rate_update_seconds);
 
-       error = div64_s64((dirty + derivative - target) << 8, target);
+       derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative,
+                             (dc->writeback_rate_d_term /
+                              dc->writeback_rate_update_seconds) ?: 1, 0);
+
+       derivative *= dc->writeback_rate_d_term;
+       derivative = div_s64(derivative, dc->writeback_rate_p_term_inverse);
 
-       change = div_s64((dc->writeback_rate.rate * error) >> 8,
-                        dc->writeback_rate_p_term_inverse);
+       change = proportional + derivative;
 
        /* Don't increase writeback rate if the device isn't keeping up */
        if (change > 0 &&
            time_after64(local_clock(),
-                        dc->writeback_rate.next + 10 * NSEC_PER_MSEC))
+                        dc->writeback_rate.next + NSEC_PER_MSEC))
                change = 0;
 
        dc->writeback_rate.rate =
-               clamp_t(int64_t, dc->writeback_rate.rate + change,
+               clamp_t(int64_t, (int64_t) dc->writeback_rate.rate + change,
                        1, NSEC_PER_MSEC);
-out:
+
+       dc->writeback_rate_proportional = proportional;
        dc->writeback_rate_derivative = derivative;
        dc->writeback_rate_change = change;
        dc->writeback_rate_target = target;
@@ -87,15 +89,11 @@ static void update_writeback_rate(struct work_struct *work)
 
 static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
 {
-       uint64_t ret;
-
        if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
            !dc->writeback_percent)
                return 0;
 
-       ret = bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL);
-
-       return min_t(uint64_t, ret, HZ);
+       return bch_next_delay(&dc->writeback_rate, sectors);
 }
 
 struct dirty_io {
@@ -241,7 +239,7 @@ static void read_dirty(struct cached_dev *dc)
                if (KEY_START(&w->key) != dc->last_read ||
                    jiffies_to_msecs(delay) > 50)
                        while (!kthread_should_stop() && delay)
-                               delay = schedule_timeout_interruptible(delay);
+                               delay = schedule_timeout_uninterruptible(delay);
 
                dc->last_read   = KEY_OFFSET(&w->key);
 
@@ -438,7 +436,7 @@ static int bch_writeback_thread(void *arg)
                        while (delay &&
                               !kthread_should_stop() &&
                               !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
-                               delay = schedule_timeout_interruptible(delay);
+                               delay = schedule_timeout_uninterruptible(delay);
                }
        }
 
@@ -476,6 +474,8 @@ void bch_sectors_dirty_init(struct cached_dev *dc)
 
        bch_btree_map_keys(&op.op, dc->disk.c, &KEY(op.inode, 0, 0),
                           sectors_dirty_init_fn, 0);
+
+       dc->disk.sectors_dirty_last = bcache_dev_sectors_dirty(&dc->disk);
 }
 
 int bch_cached_dev_writeback_init(struct cached_dev *dc)
@@ -490,18 +490,15 @@ int bch_cached_dev_writeback_init(struct cached_dev *dc)
        dc->writeback_delay             = 30;
        dc->writeback_rate.rate         = 1024;
 
-       dc->writeback_rate_update_seconds = 30;
-       dc->writeback_rate_d_term       = 16;
-       dc->writeback_rate_p_term_inverse = 64;
-       dc->writeback_rate_d_smooth     = 8;
+       dc->writeback_rate_update_seconds = 5;
+       dc->writeback_rate_d_term       = 30;
+       dc->writeback_rate_p_term_inverse = 6000;
 
        dc->writeback_thread = kthread_create(bch_writeback_thread, dc,
                                              "bcache_writeback");
        if (IS_ERR(dc->writeback_thread))
                return PTR_ERR(dc->writeback_thread);
 
-       set_task_state(dc->writeback_thread, TASK_INTERRUPTIBLE);
-
        INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
        schedule_delayed_work(&dc->writeback_rate_update,
                              dc->writeback_rate_update_seconds * HZ);