OSDN Git Service

net: do not create fallback tunnels for non-default namespaces
author: Eric Dumazet <edumazet@google.com>
Thu, 8 Mar 2018 20:51:41 +0000 (12:51 -0800)
committer: David S. Miller <davem@davemloft.net>
Fri, 9 Mar 2018 16:23:11 +0000 (11:23 -0500)
fallback tunnels (like tunl0, gre0, gretap0, erspan0, sit0,
ip6tnl0, ip6gre0) are automatically created when the corresponding
module is loaded.

These tunnels are also automatically created when a new network
namespace is created, at a great cost.

In many cases, netns are used for isolation purposes, and these
extra network devices are a waste of resources. We are using
thousands of netns per host, and hit the netns creation/delete
bottleneck a lot. (Many thanks to Kirill for recent work on this)

Add a new sysctl so that we can opt-out from this automatic creation.

Note that these tunnels are still created for the initial namespace,
to be the least intrusive for typical setups.

Tested:
lpk43:~# cat add_del_unshare.sh
for i in `seq 1 40`
do
 (for j in `seq 1 100` ; do  unshare -n /bin/true >/dev/null ; done) &
done
wait

lpk43:~# echo 0 >/proc/sys/net/core/fb_tunnels_only_for_init_net
lpk43:~# time ./add_del_unshare.sh

real 0m37.521s
user 0m0.886s
sys 7m7.084s
lpk43:~# echo 1 >/proc/sys/net/core/fb_tunnels_only_for_init_net
lpk43:~# time ./add_del_unshare.sh

real 0m4.761s
user 0m0.851s
sys 1m8.343s
lpk43:~#

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Documentation/sysctl/net.txt
include/linux/netdevice.h
include/net/ip_tunnels.h
net/core/sysctl_net_core.c
net/ipv4/ip_tunnel.c
net/ipv6/ip6_gre.c
net/ipv6/ip6_tunnel.c
net/ipv6/sit.c

index 35c62f5..5992602 100644 (file)
@@ -270,6 +270,18 @@ optmem_max
 Maximum ancillary buffer size allowed per socket. Ancillary data is a sequence
 of struct cmsghdr structures with appended data.
 
+fb_tunnels_only_for_init_net
+----------------------------
+
+Controls if fallback tunnels (like tunl0, gre0, gretap0, erspan0,
+sit0, ip6tnl0, ip6gre0) are automatically created when a new
+network namespace is created, if corresponding tunnel is present
+in initial network namespace.
+If set to 1, these devices are not automatically created, and
+user space is responsible for creating them if needed.
+
+Default : 0  (for compatibility reasons)
+
 2. /proc/sys/net/unix - Parameters for Unix domain sockets
 -------------------------------------------------------
 
index 95a613a..9711108 100644 (file)
@@ -585,6 +585,13 @@ struct netdev_queue {
 #endif
 } ____cacheline_aligned_in_smp;
 
+extern int sysctl_fb_tunnels_only_for_init_net;
+
+static inline bool net_has_fallback_tunnels(const struct net *net)
+{
+       return net == &init_net || !sysctl_fb_tunnels_only_for_init_net;
+}
+
 static inline int netdev_queue_numa_node_read(const struct netdev_queue *q)
 {
 #if defined(CONFIG_XPS) && defined(CONFIG_NUMA)
index cbe5add..540a4b4 100644 (file)
@@ -180,8 +180,10 @@ struct tnl_ptk_info {
 
 struct ip_tunnel_net {
        struct net_device *fb_tunnel_dev;
+       struct rtnl_link_ops *rtnl_link_ops;
        struct hlist_head tunnels[IP_TNL_HASH_SIZE];
        struct ip_tunnel __rcu *collect_md_tun;
+       int type;
 };
 
 static inline void ip_tunnel_key_init(struct ip_tunnel_key *key,
index d714f65..4f47f92 100644 (file)
@@ -32,6 +32,9 @@ static int max_skb_frags = MAX_SKB_FRAGS;
 
 static int net_msg_warn;       /* Unused, but still a sysctl */
 
+int sysctl_fb_tunnels_only_for_init_net __read_mostly = 0;
+EXPORT_SYMBOL(sysctl_fb_tunnels_only_for_init_net);
+
 #ifdef CONFIG_RPS
 static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
                                void __user *buffer, size_t *lenp, loff_t *ppos)
@@ -513,6 +516,15 @@ static struct ctl_table net_core_table[] = {
                .proc_handler   = proc_dointvec_minmax,
                .extra1         = &zero,
        },
+       {
+               .procname       = "fb_tunnels_only_for_init_net",
+               .data           = &sysctl_fb_tunnels_only_for_init_net,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = &zero,
+               .extra2         = &one,
+       },
        { }
 };
 
index 602597d..5fcb17c 100644 (file)
@@ -347,8 +347,7 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net,
        struct net_device *dev;
        int t_hlen;
 
-       BUG_ON(!itn->fb_tunnel_dev);
-       dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
+       dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms);
        if (IS_ERR(dev))
                return ERR_CAST(dev);
 
@@ -822,7 +821,6 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
        struct net *net = t->net;
        struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);
 
-       BUG_ON(!itn->fb_tunnel_dev);
        switch (cmd) {
        case SIOCGETTUNNEL:
                if (dev == itn->fb_tunnel_dev) {
@@ -847,7 +845,7 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
                                p->o_key = 0;
                }
 
-               t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
+               t = ip_tunnel_find(itn, p, itn->type);
 
                if (cmd == SIOCADDTUNNEL) {
                        if (!t) {
@@ -991,10 +989,15 @@ int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
        struct ip_tunnel_parm parms;
        unsigned int i;
 
+       itn->rtnl_link_ops = ops;
        for (i = 0; i < IP_TNL_HASH_SIZE; i++)
                INIT_HLIST_HEAD(&itn->tunnels[i]);
 
-       if (!ops) {
+       if (!ops || !net_has_fallback_tunnels(net)) {
+               struct ip_tunnel_net *it_init_net;
+
+               it_init_net = net_generic(&init_net, ip_tnl_net_id);
+               itn->type = it_init_net->type;
                itn->fb_tunnel_dev = NULL;
                return 0;
        }
@@ -1012,6 +1015,7 @@ int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
                itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
                itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
                ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
+               itn->type = itn->fb_tunnel_dev->type;
        }
        rtnl_unlock();
 
@@ -1019,10 +1023,10 @@ int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
 }
 EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
 
-static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
+static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
+                             struct list_head *head,
                              struct rtnl_link_ops *ops)
 {
-       struct net *net = dev_net(itn->fb_tunnel_dev);
        struct net_device *dev, *aux;
        int h;
 
@@ -1054,7 +1058,7 @@ void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
        rtnl_lock();
        list_for_each_entry(net, net_list, exit_list) {
                itn = net_generic(net, id);
-               ip_tunnel_destroy(itn, &list, ops);
+               ip_tunnel_destroy(net, itn, &list, ops);
        }
        unregister_netdevice_many(&list);
        rtnl_unlock();
index 18a3dfb..7d8775c 100644 (file)
@@ -236,7 +236,7 @@ static struct ip6_tnl *ip6gre_tunnel_lookup(struct net_device *dev,
                return t;
 
        dev = ign->fb_tunnel_dev;
-       if (dev->flags & IFF_UP)
+       if (dev && dev->flags & IFF_UP)
                return netdev_priv(dev);
 
        return NULL;
@@ -1472,6 +1472,8 @@ static int __net_init ip6gre_init_net(struct net *net)
        struct ip6gre_net *ign = net_generic(net, ip6gre_net_id);
        int err;
 
+       if (!net_has_fallback_tunnels(net))
+               return 0;
        ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6gre0",
                                          NET_NAME_UNKNOWN,
                                          ip6gre_tunnel_setup);
index 56c4967..5c045fa 100644 (file)
@@ -2205,6 +2205,8 @@ static int __net_init ip6_tnl_init_net(struct net *net)
        ip6n->tnls[0] = ip6n->tnls_wc;
        ip6n->tnls[1] = ip6n->tnls_r_l;
 
+       if (!net_has_fallback_tunnels(net))
+               return 0;
        err = -ENOMEM;
        ip6n->fb_tnl_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6tnl0",
                                        NET_NAME_UNKNOWN, ip6_tnl_dev_setup);
index a9c4ac6..8a4f8fd 100644 (file)
@@ -182,7 +182,7 @@ static void ipip6_tunnel_clone_6rd(struct net_device *dev, struct sit_net *sitn)
 #ifdef CONFIG_IPV6_SIT_6RD
        struct ip_tunnel *t = netdev_priv(dev);
 
-       if (dev == sitn->fb_tunnel_dev) {
+       if (dev == sitn->fb_tunnel_dev || !sitn->fb_tunnel_dev) {
                ipv6_addr_set(&t->ip6rd.prefix, htonl(0x20020000), 0, 0, 0);
                t->ip6rd.relay_prefix = 0;
                t->ip6rd.prefixlen = 16;
@@ -1835,6 +1835,9 @@ static int __net_init sit_init_net(struct net *net)
        sitn->tunnels[2] = sitn->tunnels_r;
        sitn->tunnels[3] = sitn->tunnels_r_l;
 
+       if (!net_has_fallback_tunnels(net))
+               return 0;
+
        sitn->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "sit0",
                                           NET_NAME_UNKNOWN,
                                           ipip6_tunnel_setup);