bpf: add lookup/update support for per-cpu hash and array maps

author Alexei Starovoitov <ast@fb.com>

Tue, 2 Feb 2016 06:39:55 +0000 (22:39 -0800)

committer David S. Miller <davem@davemloft.net>

Sat, 6 Feb 2016 08:34:36 +0000 (03:34 -0500)
author Alexei Starovoitov <ast@fb.com>
Tue, 2 Feb 2016 06:39:55 +0000 (22:39 -0800)
committer David S. Miller <davem@davemloft.net>
Sat, 6 Feb 2016 08:34:36 +0000 (03:34 -0500)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h

index 141fb0d..90ee6ab 100644 (file)
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -183,6 +183,29 @@ int bpf_prog_new_fd(struct bpf_prog *prog);
  int bpf_obj_pin_user(u32 ufd, const char __user *pathname);
  int bpf_obj_get_user(const char __user *pathname);
  
+int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value);
+int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value);
+int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
+                          u64 flags);
+int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
+                           u64 flags);
+
+/* memcpy for 8-byte aligned pointers and sizes that are a multiple of 8,
+ * forced to use 'long' reads/writes to try to atomically copy long counters.
+ * Best-effort only.  No barriers here, since it _will_ race with concurrent
+ * updates from BPF programs. Called from bpf syscall and mostly used with
+ * size 8 or 16 bytes, so ask compiler to inline it.
+ */
+static inline void bpf_long_memcpy(void *dst, const void *src, u32 size)
+{
+       const long *lsrc = src;
+       long *ldst = dst;
+
+       size /= sizeof(long);   /* bytes -> number of longs to copy */
+       while (size--)
+               *ldst++ = *lsrc++;
+}
+
  /* verify correctness of eBPF program */
  int bpf_check(struct bpf_prog **fp, union bpf_attr *attr);
  #else
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c

index b9bf1d7..bd3bdf2 100644 (file)
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -130,6 +130,32 @@ static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key)
         return this_cpu_ptr(array->pptrs[index]);
  }
  
+int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
+{
+       struct bpf_array *array = container_of(map, struct bpf_array, map);
+       u32 index = *(u32 *)key;
+       void __percpu *pptr;
+       int cpu, off = 0;       /* off: running offset into 'value' */
+       u32 size;
+
+       if (unlikely(index >= array->map.max_entries))
+               return -ENOENT;
+
+       /* 'value' receives one round_up(value_size, 8) slot per possible cpu.
+        * per_cpu areas are zero-filled and bpf programs can only access
+        * 'value_size' of them, so copying rounded areas leaks no kernel data
+        */
+       size = round_up(map->value_size, 8);
+       rcu_read_lock();
+       pptr = array->pptrs[index];
+       for_each_possible_cpu(cpu) {
+               bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size);
+               off += size;
+       }
+       rcu_read_unlock();
+       return 0;
+}
+
  /* Called from syscall */
  static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
  {
@@ -177,6 +203,44 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
         return 0;
  }
  
+int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
+                           u64 map_flags)
+{
+       struct bpf_array *array = container_of(map, struct bpf_array, map);
+       u32 index = *(u32 *)key;
+       void __percpu *pptr;
+       int cpu, off = 0;       /* off: running offset into 'value' */
+       u32 size;
+
+       if (unlikely(map_flags > BPF_EXIST))
+               /* unknown flags */
+               return -EINVAL;
+
+       if (unlikely(index >= array->map.max_entries))
+               /* all elements were pre-allocated, cannot insert a new one */
+               return -E2BIG;
+
+       if (unlikely(map_flags == BPF_NOEXIST))
+               /* all elements already exist */
+               return -EEXIST;
+
+       /* user space provides round_up(value_size, 8) bytes per possible
+        * cpu, and each chunk is copied into that cpu's per-cpu area. bpf
+        * programs can only access value_size bytes of it. A later lookup
+        * returns either these same extra bytes or the zeros that were
+        * zero-filled by percpu_alloc, so no kernel data leaks are possible
+        */
+       size = round_up(map->value_size, 8);
+       rcu_read_lock();
+       pptr = array->pptrs[index];
+       for_each_possible_cpu(cpu) {
+               bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size);
+               off += size;
+       }
+       rcu_read_unlock();
+       return 0;
+}
+
  /* Called from syscall or from eBPF program */
  static int array_map_delete_elem(struct bpf_map *map, void *key)
  {
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c

index 2be5f6e..fd5db8f 100644 (file)
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -290,7 +290,7 @@ static void free_htab_elem(struct htab_elem *l, bool percpu, u32 key_size)
  
  static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
                                          void *value, u32 key_size, u32 hash,
-                                        bool percpu)
+                                        bool percpu, bool onallcpus)
  {
         u32 size = htab->map.value_size;
         struct htab_elem *l_new;
@@ -312,8 +312,18 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
                         return NULL;
                 }
  
-               /* copy true value_size bytes */
-               memcpy(this_cpu_ptr(pptr), value, htab->map.value_size);
+               if (!onallcpus) {
+                       /* copy true value_size bytes */
+                       memcpy(this_cpu_ptr(pptr), value, htab->map.value_size);
+               } else {
+                       int off = 0, cpu;
+
+                       for_each_possible_cpu(cpu) {
+                               bpf_long_memcpy(per_cpu_ptr(pptr, cpu),
+                                               value + off, size);
+                               off += size;
+                       }
+               }
                 htab_elem_set_ptr(l_new, key_size, pptr);
         } else {
                 memcpy(l_new->key + round_up(key_size, 8), value, size);
@@ -368,7 +378,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
         /* allocate new element outside of the lock, since
          * we're most likley going to insert it
          */
-       l_new = alloc_htab_elem(htab, key, value, key_size, hash, false);
+       l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false);
         if (!l_new)
                 return -ENOMEM;
  
@@ -402,8 +412,9 @@ err:
         return ret;
  }
  
-static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
-                                      void *value, u64 map_flags)
+static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
+                                        void *value, u64 map_flags,
+                                        bool onallcpus)
  {
         struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
         struct htab_elem *l_new = NULL, *l_old;
@@ -436,12 +447,25 @@ static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
                 goto err;
  
         if (l_old) {
+               void __percpu *pptr = htab_elem_get_ptr(l_old, key_size);
+               u32 size = htab->map.value_size;
+
                 /* per-cpu hash map can update value in-place */
-               memcpy(this_cpu_ptr(htab_elem_get_ptr(l_old, key_size)),
-                      value, htab->map.value_size);
+               if (!onallcpus) {
+                       memcpy(this_cpu_ptr(pptr), value, size);
+               } else {
+                       int off = 0, cpu;
+
+                       size = round_up(size, 8);
+                       for_each_possible_cpu(cpu) {
+                               bpf_long_memcpy(per_cpu_ptr(pptr, cpu),
+                                               value + off, size);
+                               off += size;
+                       }
+               }
         } else {
                 l_new = alloc_htab_elem(htab, key, value, key_size,
-                                       hash, true);
+                                       hash, true, onallcpus);
                 if (!l_new) {
                         ret = -ENOMEM;
                         goto err;
@@ -455,6 +479,12 @@ err:
         return ret;
  }
  
+static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
+                                      void *value, u64 map_flags)
+{      /* onallcpus = false: 'value' is written via this_cpu_ptr() only */
+       return __htab_percpu_map_update_elem(map, key, value, map_flags, false);
+}
+
  /* Called from syscall or from eBPF program */
  static int htab_map_delete_elem(struct bpf_map *map, void *key)
  {
@@ -557,6 +587,41 @@ static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
                 return NULL;
  }
  
+int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
+{
+       struct htab_elem *l;
+       void __percpu *pptr;
+       int ret = -ENOENT;      /* returned when the key is not found */
+       int cpu, off = 0;       /* off: running offset into 'value' */
+       u32 size;
+
+       /* 'value' receives one round_up(value_size, 8) slot per possible cpu.
+        * per_cpu areas are zero-filled and bpf programs can only access
+        * 'value_size' of them, so copying rounded areas leaks no kernel data
+        */
+       size = round_up(map->value_size, 8);
+       rcu_read_lock();
+       l = __htab_map_lookup_elem(map, key);
+       if (!l)
+               goto out;
+       pptr = htab_elem_get_ptr(l, map->key_size);
+       for_each_possible_cpu(cpu) {
+               bpf_long_memcpy(value + off,
+                               per_cpu_ptr(pptr, cpu), size);
+               off += size;
+       }
+       ret = 0;
+out:
+       rcu_read_unlock();
+       return ret;
+}
+
+int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
+                          u64 map_flags)
+{      /* onallcpus = true: 'value' holds one slot per possible cpu */
+       return __htab_percpu_map_update_elem(map, key, value, map_flags, true);
+}
+
  static const struct bpf_map_ops htab_percpu_ops = {
         .map_alloc = htab_map_alloc,
         .map_free = htab_map_free,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c

index 6373970..c95a753 100644 (file)
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -239,6 +239,7 @@ static int map_lookup_elem(union bpf_attr *attr)
         int ufd = attr->map_fd;
         struct bpf_map *map;
         void *key, *value, *ptr;
+       u32 value_size;
         struct fd f;
         int err;
  
@@ -259,23 +260,35 @@ static int map_lookup_elem(union bpf_attr *attr)
         if (copy_from_user(key, ukey, map->key_size) != 0)
                 goto free_key;
  
+       if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+           map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
+               value_size = round_up(map->value_size, 8) * num_possible_cpus();
+       else
+               value_size = map->value_size;
+
         err = -ENOMEM;
-       value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN);
+       value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
         if (!value)
                 goto free_key;
  
-       rcu_read_lock();
-       ptr = map->ops->map_lookup_elem(map, key);
-       if (ptr)
-               memcpy(value, ptr, map->value_size);
-       rcu_read_unlock();
+       if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) {
+               err = bpf_percpu_hash_copy(map, key, value);
+       } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
+               err = bpf_percpu_array_copy(map, key, value);
+       } else {
+               rcu_read_lock();
+               ptr = map->ops->map_lookup_elem(map, key);
+               if (ptr)
+                       memcpy(value, ptr, value_size);
+               rcu_read_unlock();
+               err = ptr ? 0 : -ENOENT;
+       }
  
-       err = -ENOENT;
-       if (!ptr)
+       if (err)
                 goto free_value;
  
         err = -EFAULT;
-       if (copy_to_user(uvalue, value, map->value_size) != 0)
+       if (copy_to_user(uvalue, value, value_size) != 0)
                 goto free_value;
  
         err = 0;
@@ -298,6 +311,7 @@ static int map_update_elem(union bpf_attr *attr)
         int ufd = attr->map_fd;
         struct bpf_map *map;
         void *key, *value;
+       u32 value_size;
         struct fd f;
         int err;
  
@@ -318,21 +332,30 @@ static int map_update_elem(union bpf_attr *attr)
         if (copy_from_user(key, ukey, map->key_size) != 0)
                 goto free_key;
  
+       if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+           map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
+               value_size = round_up(map->value_size, 8) * num_possible_cpus();
+       else
+               value_size = map->value_size;
+
         err = -ENOMEM;
-       value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN);
+       value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
         if (!value)
                 goto free_key;
  
         err = -EFAULT;
-       if (copy_from_user(value, uvalue, map->value_size) != 0)
+       if (copy_from_user(value, uvalue, value_size) != 0)
                 goto free_value;
  
-       /* eBPF program that use maps are running under rcu_read_lock(),
-        * therefore all map accessors rely on this fact, so do the same here
-        */
-       rcu_read_lock();
-       err = map->ops->map_update_elem(map, key, value, attr->flags);
-       rcu_read_unlock();
+       if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) {
+               err = bpf_percpu_hash_update(map, key, value, attr->flags);
+       } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
+               err = bpf_percpu_array_update(map, key, value, attr->flags);
+       } else {
+               rcu_read_lock();
+               err = map->ops->map_update_elem(map, key, value, attr->flags);
+               rcu_read_unlock();
+       }
  
  free_value:
         kfree(value);
author	Alexei Starovoitov <ast@fb.com>
	Tue, 2 Feb 2016 06:39:55 +0000 (22:39 -0800)
committer	David S. Miller <davem@davemloft.net>
	Sat, 6 Feb 2016 08:34:36 +0000 (03:34 -0500)
include/linux/bpf.h		patch \| blob \| history
kernel/bpf/arraymap.c		patch \| blob \| history
kernel/bpf/hashtab.c		patch \| blob \| history
kernel/bpf/syscall.c		patch \| blob \| history