From 6f428df47dae2c8ea31fd4c0c74a12a8a5ac2d1d Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 21 Jun 2017 17:27:18 +0200 Subject: [PATCH] libceph: pg_upmap[_items] infrastructure pg_temp and pg_upmap encodings are the same (PG -> array of osds), except for the incremental remove: it's an empty mapping in new_pg_temp for pg_temp and a separate old_pg_upmap set for pg_upmap. (This isn't to allow for empty pg_upmap mappings -- apparently, pg_temp just wasn't looked at as an example for pg_upmap encoding.) Reuse __decode_pg_temp() for decoding pg_upmap and new_pg_upmap. __decode_pg_temp() stores into pg_temp union member, but since pg_upmap union member is identical, reading through pg_upmap later is OK. Signed-off-by: Ilya Dryomov --- include/linux/ceph/osdmap.h | 10 +++- net/ceph/debugfs.c | 23 ++++++++ net/ceph/osdmap.c | 135 +++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 164 insertions(+), 4 deletions(-) diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index fe6d189bdd30..c612cff81f5c 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -143,10 +143,14 @@ struct ceph_pg_mapping { struct { int len; int osds[]; - } pg_temp; + } pg_temp, pg_upmap; struct { int osd; } primary_temp; + struct { + int len; + int from_to[][2]; + } pg_upmap_items; }; }; @@ -165,6 +169,10 @@ struct ceph_osdmap { struct rb_root pg_temp; struct rb_root primary_temp; + /* remap (post-CRUSH, pre-up) */ + struct rb_root pg_upmap; /* PG := raw set */ + struct rb_root pg_upmap_items; /* from -> to within raw set */ + u32 *osd_primary_affinity; struct rb_root pg_pools; diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 017f15c575f8..4f57d5bcaba2 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -104,6 +104,29 @@ static int osdmap_show(struct seq_file *s, void *p) seq_printf(s, "primary_temp %llu.%x %d\n", pg->pgid.pool, pg->pgid.seed, pg->primary_temp.osd); } + for (n = rb_first(&map->pg_upmap); n; n = rb_next(n)) { + struct ceph_pg_mapping *pg = + rb_entry(n, struct ceph_pg_mapping, node); + + seq_printf(s, "pg_upmap %llu.%x [", pg->pgid.pool, + pg->pgid.seed); + for (i = 0; i < pg->pg_upmap.len; i++) + seq_printf(s, "%s%d", (i == 0 ? "" : ","), + pg->pg_upmap.osds[i]); + seq_printf(s, "]\n"); + } + for (n = rb_first(&map->pg_upmap_items); n; n = rb_next(n)) { + struct ceph_pg_mapping *pg = + rb_entry(n, struct ceph_pg_mapping, node); + + seq_printf(s, "pg_upmap_items %llu.%x [", pg->pgid.pool, + pg->pgid.seed); + for (i = 0; i < pg->pg_upmap_items.len; i++) + seq_printf(s, "%s%d->%d", (i == 0 ? "" : ","), + pg->pg_upmap_items.from_to[i][0], + pg->pg_upmap_items.from_to[i][1]); + seq_printf(s, "]\n"); + } up_read(&osdc->lock); return 0; diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index f6d561edd511..a3f60d0bfd13 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -735,6 +735,8 @@ struct ceph_osdmap *ceph_osdmap_alloc(void) map->pool_max = -1; map->pg_temp = RB_ROOT; map->primary_temp = RB_ROOT; + map->pg_upmap = RB_ROOT; + map->pg_upmap_items = RB_ROOT; mutex_init(&map->crush_workspace_mutex); return map; @@ -759,6 +761,20 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map) erase_pg_mapping(&map->primary_temp, pg); free_pg_mapping(pg); } + while (!RB_EMPTY_ROOT(&map->pg_upmap)) { + struct ceph_pg_mapping *pg = + rb_entry(rb_first(&map->pg_upmap), + struct ceph_pg_mapping, node); + rb_erase(&pg->node, &map->pg_upmap); + kfree(pg); + } + while (!RB_EMPTY_ROOT(&map->pg_upmap_items)) { + struct ceph_pg_mapping *pg = + rb_entry(rb_first(&map->pg_upmap_items), + struct ceph_pg_mapping, node); + rb_erase(&pg->node, &map->pg_upmap_items); + kfree(pg); + } while (!RB_EMPTY_ROOT(&map->pg_pools)) { struct ceph_pg_pool_info *pi = rb_entry(rb_first(&map->pg_pools), @@ -1161,6 +1177,75 @@ e_inval: return -EINVAL; } +static struct ceph_pg_mapping *__decode_pg_upmap(void **p, void *end, + bool __unused) +{ + return __decode_pg_temp(p, end, false); +} + +static int decode_pg_upmap(void **p, void *end, struct ceph_osdmap *map) +{ + return decode_pg_mapping(p, end, &map->pg_upmap, __decode_pg_upmap, + false); +} + +static int decode_new_pg_upmap(void **p, void *end, struct ceph_osdmap *map) +{ + return decode_pg_mapping(p, end, &map->pg_upmap, __decode_pg_upmap, + true); +} + +static int decode_old_pg_upmap(void **p, void *end, struct ceph_osdmap *map) +{ + return decode_pg_mapping(p, end, &map->pg_upmap, NULL, true); +} + +static struct ceph_pg_mapping *__decode_pg_upmap_items(void **p, void *end, + bool __unused) +{ + struct ceph_pg_mapping *pg; + u32 len, i; + + ceph_decode_32_safe(p, end, len, e_inval); + if (len > (SIZE_MAX - sizeof(*pg)) / (2 * sizeof(u32))) + return ERR_PTR(-EINVAL); + + ceph_decode_need(p, end, 2 * len * sizeof(u32), e_inval); + pg = kzalloc(sizeof(*pg) + 2 * len * sizeof(u32), GFP_NOIO); + if (!pg) + return ERR_PTR(-ENOMEM); + + pg->pg_upmap_items.len = len; + for (i = 0; i < len; i++) { + pg->pg_upmap_items.from_to[i][0] = ceph_decode_32(p); + pg->pg_upmap_items.from_to[i][1] = ceph_decode_32(p); + } + + return pg; + +e_inval: + return ERR_PTR(-EINVAL); +} + +static int decode_pg_upmap_items(void **p, void *end, struct ceph_osdmap *map) +{ + return decode_pg_mapping(p, end, &map->pg_upmap_items, + __decode_pg_upmap_items, false); +} + +static int decode_new_pg_upmap_items(void **p, void *end, + struct ceph_osdmap *map) +{ + return decode_pg_mapping(p, end, &map->pg_upmap_items, + __decode_pg_upmap_items, true); +} + +static int decode_old_pg_upmap_items(void **p, void *end, + struct ceph_osdmap *map) +{ + return decode_pg_mapping(p, end, &map->pg_upmap_items, NULL, true); +} + /* * decode a full map. */ @@ -1250,9 +1335,7 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) if (err) goto bad; } else { - /* XXX can this happen? */ - kfree(map->osd_primary_affinity); - map->osd_primary_affinity = NULL; + WARN_ON(map->osd_primary_affinity); } /* crush */ @@ -1261,6 +1344,26 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) if (err) goto bad; + *p += len; + if (struct_v >= 3) { + /* erasure_code_profiles */ + ceph_decode_skip_map_of_map(p, end, string, string, string, + bad); + } + + if (struct_v >= 4) { + err = decode_pg_upmap(p, end, map); + if (err) + goto bad; + + err = decode_pg_upmap_items(p, end, map); + if (err) + goto bad; + } else { + WARN_ON(!RB_EMPTY_ROOT(&map->pg_upmap)); + WARN_ON(!RB_EMPTY_ROOT(&map->pg_upmap_items)); + } + /* ignore the rest */ *p = end; @@ -1520,6 +1623,32 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, goto bad; } + if (struct_v >= 3) { + /* new_erasure_code_profiles */ + ceph_decode_skip_map_of_map(p, end, string, string, string, + bad); + /* old_erasure_code_profiles */ + ceph_decode_skip_set(p, end, string, bad); + } + + if (struct_v >= 4) { + err = decode_new_pg_upmap(p, end, map); + if (err) + goto bad; + + err = decode_old_pg_upmap(p, end, map); + if (err) + goto bad; + + err = decode_new_pg_upmap_items(p, end, map); + if (err) + goto bad; + + err = decode_old_pg_upmap_items(p, end, map); + if (err) + goto bad; + } + /* ignore the rest */ *p = end; -- 2.11.0