/*
 * Scraped git-viewer header (OSDN Git Service) — original path:
 * [uclinux-h8/linux.git] / drivers / net / vxlan.c
 */
1 /*
2  * VXLAN: Virtual eXtensible Local Area Network
3  *
4  * Copyright (c) 2012-2013 Vyatta Inc.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 as
8  * published by the Free Software Foundation.
9  */
10
11 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
12
13 #include <linux/kernel.h>
14 #include <linux/module.h>
15 #include <linux/errno.h>
16 #include <linux/slab.h>
17 #include <linux/udp.h>
18 #include <linux/igmp.h>
19 #include <linux/if_ether.h>
20 #include <linux/ethtool.h>
21 #include <net/arp.h>
22 #include <net/ndisc.h>
23 #include <net/ip.h>
24 #include <net/icmp.h>
25 #include <net/rtnetlink.h>
26 #include <net/inet_ecn.h>
27 #include <net/net_namespace.h>
28 #include <net/netns/generic.h>
29 #include <net/tun_proto.h>
30 #include <net/vxlan.h>
31
32 #if IS_ENABLED(CONFIG_IPV6)
33 #include <net/ip6_tunnel.h>
34 #include <net/ip6_checksum.h>
35 #endif
36
37 #define VXLAN_VERSION   "0.1"
38
39 #define PORT_HASH_BITS  8
40 #define PORT_HASH_SIZE  (1<<PORT_HASH_BITS)
41 #define FDB_AGE_DEFAULT 300 /* 5 min */
42 #define FDB_AGE_INTERVAL (10 * HZ)      /* rescan interval */
43
44 /* UDP port for VXLAN traffic.
45  * The IANA assigned port is 4789, but the Linux default is 8472
46  * for compatibility with early adopters.
47  */
48 static unsigned short vxlan_port __read_mostly = 8472;
49 module_param_named(udp_port, vxlan_port, ushort, 0444);
50 MODULE_PARM_DESC(udp_port, "Destination UDP port");
51
52 static bool log_ecn_error = true;
53 module_param(log_ecn_error, bool, 0644);
54 MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
55
56 static unsigned int vxlan_net_id;
57 static struct rtnl_link_ops vxlan_link_ops;
58
59 static const u8 all_zeros_mac[ETH_ALEN + 2];
60
61 static int vxlan_sock_add(struct vxlan_dev *vxlan);
62
63 static void vxlan_vs_del_dev(struct vxlan_dev *vxlan);
64
/* per-network namespace private data for this module */
struct vxlan_net {
        struct list_head  vxlan_list;   /* all vxlan devices in this netns */
        struct hlist_head sock_list[PORT_HASH_SIZE]; /* UDP sockets, hashed by port (see vs_head()) */
        spinlock_t        sock_lock;    /* NOTE(review): presumably guards sock_list mutation — confirm at call sites */
};
71
/* Forwarding table entry: maps a MAC (and, in collect-metadata mode, a
 * source VNI) to one or more remote destinations.
 */
struct vxlan_fdb {
        struct hlist_node hlist;        /* linked list of entries */
        struct rcu_head   rcu;          /* deferred free via vxlan_fdb_free() */
        unsigned long     updated;      /* jiffies */
        unsigned long     used;         /* jiffies of last lookup (see vxlan_find_mac()) */
        struct list_head  remotes;      /* list of vxlan_rdst; never empty (see first_remote_rcu()) */
        u8                eth_addr[ETH_ALEN];
        u16               state;        /* see ndm_state */
        __be32            vni;          /* source VNI key, used in collect-metadata mode */
        u16               flags;        /* see ndm_flags and below */
};
84
85 #define NTF_VXLAN_ADDED_BY_USER 0x100
86
87 /* salt for hash table */
88 static u32 vxlan_salt __read_mostly;
89
90 static inline bool vxlan_collect_metadata(struct vxlan_sock *vs)
91 {
92         return vs->flags & VXLAN_F_COLLECT_METADATA ||
93                ip_tunnel_collect_metadata();
94 }
95
#if IS_ENABLED(CONFIG_IPV6)
/* Compare two tunnel endpoint addresses; families must match first. */
static inline
bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b)
{
        if (a->sa.sa_family != b->sa.sa_family)
                return false;
        if (a->sa.sa_family == AF_INET6)
                return ipv6_addr_equal(&a->sin6.sin6_addr, &b->sin6.sin6_addr);
        else
                return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr;
}

/* Parse a netlink address attribute into @ip; the attribute length
 * selects the family (16 bytes -> IPv6, 4 bytes -> IPv4).
 * Returns 0 on success or -EAFNOSUPPORT for an unusable length.
 */
static int vxlan_nla_get_addr(union vxlan_addr *ip, struct nlattr *nla)
{
        if (nla_len(nla) >= sizeof(struct in6_addr)) {
                ip->sin6.sin6_addr = nla_get_in6_addr(nla);
                ip->sa.sa_family = AF_INET6;
                return 0;
        } else if (nla_len(nla) >= sizeof(__be32)) {
                ip->sin.sin_addr.s_addr = nla_get_in_addr(nla);
                ip->sa.sa_family = AF_INET;
                return 0;
        } else {
                return -EAFNOSUPPORT;
        }
}

/* Emit @ip as netlink attribute @attr, sized per its address family. */
static int vxlan_nla_put_addr(struct sk_buff *skb, int attr,
                              const union vxlan_addr *ip)
{
        if (ip->sa.sa_family == AF_INET6)
                return nla_put_in6_addr(skb, attr, &ip->sin6.sin6_addr);
        else
                return nla_put_in_addr(skb, attr, ip->sin.sin_addr.s_addr);
}

#else /* !CONFIG_IPV6 */

/* IPv4-only build: addresses are always AF_INET, compare the raw s_addr. */
static inline
bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b)
{
        return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr;
}

/* IPv4-only build: reject IPv6-sized attributes explicitly. */
static int vxlan_nla_get_addr(union vxlan_addr *ip, struct nlattr *nla)
{
        if (nla_len(nla) >= sizeof(struct in6_addr)) {
                return -EAFNOSUPPORT;
        } else if (nla_len(nla) >= sizeof(__be32)) {
                ip->sin.sin_addr.s_addr = nla_get_in_addr(nla);
                ip->sa.sa_family = AF_INET;
                return 0;
        } else {
                return -EAFNOSUPPORT;
        }
}

/* IPv4-only build: always emit a 4-byte address attribute. */
static int vxlan_nla_put_addr(struct sk_buff *skb, int attr,
                              const union vxlan_addr *ip)
{
        return nla_put_in_addr(skb, attr, ip->sin.sin_addr.s_addr);
}
#endif
159
160 /* Virtual Network hash table head */
161 static inline struct hlist_head *vni_head(struct vxlan_sock *vs, __be32 vni)
162 {
163         return &vs->vni_list[hash_32((__force u32)vni, VNI_HASH_BITS)];
164 }
165
166 /* Socket hash table head */
167 static inline struct hlist_head *vs_head(struct net *net, __be16 port)
168 {
169         struct vxlan_net *vn = net_generic(net, vxlan_net_id);
170
171         return &vn->sock_list[hash_32(ntohs(port), PORT_HASH_BITS)];
172 }
173
/* First remote destination for a forwarding entry.
 * Guaranteed to be non-NULL because remotes are never deleted.
 * RCU variant: callers must hold rcu_read_lock().
 */
static inline struct vxlan_rdst *first_remote_rcu(struct vxlan_fdb *fdb)
{
        return list_entry_rcu(fdb->remotes.next, struct vxlan_rdst, list);
}
181
/* As first_remote_rcu(), but for callers serialized by RTNL instead of RCU. */
static inline struct vxlan_rdst *first_remote_rtnl(struct vxlan_fdb *fdb)
{
        return list_first_entry(&fdb->remotes, struct vxlan_rdst, list);
}
186
/* Find VXLAN socket based on network namespace, address family and UDP port
 * and enabled unshareable flags.
 * Runs under RCU; only flags in VXLAN_F_RCV_FLAGS participate in matching.
 * Returns NULL when no socket matches.
 */
static struct vxlan_sock *vxlan_find_sock(struct net *net, sa_family_t family,
                                          __be16 port, u32 flags, int ifindex)
{
        struct vxlan_sock *vs;

        /* only receive-relevant flags distinguish sockets */
        flags &= VXLAN_F_RCV_FLAGS;

        hlist_for_each_entry_rcu(vs, vs_head(net, port), hlist) {
                if (inet_sk(vs->sock->sk)->inet_sport == port &&
                    vxlan_get_sk_family(vs) == family &&
                    vs->flags == flags &&
                    vs->sock->sk->sk_bound_dev_if == ifindex)
                        return vs;
        }
        return NULL;
}
206
/* Find the vxlan device on socket @vs that owns @vni.
 * In collect-metadata (flow-based) mode all traffic maps to the single
 * VNI-0 device.  For IPv6 link-local configurations the device is
 * additionally keyed by the underlying interface index.
 * Runs under RCU; returns NULL when no device matches.
 */
static struct vxlan_dev *vxlan_vs_find_vni(struct vxlan_sock *vs, int ifindex,
                                           __be32 vni)
{
        struct vxlan_dev_node *node;

        /* For flow based devices, map all packets to VNI 0 */
        if (vs->flags & VXLAN_F_COLLECT_METADATA)
                vni = 0;

        hlist_for_each_entry_rcu(node, vni_head(vs, vni), hlist) {
                if (node->vxlan->default_dst.remote_vni != vni)
                        continue;

                if (IS_ENABLED(CONFIG_IPV6)) {
                        const struct vxlan_config *cfg = &node->vxlan->cfg;

                        /* link-local devices only match on their own ifindex */
                        if ((cfg->flags & VXLAN_F_IPV6_LINKLOCAL) &&
                            cfg->remote_ifindex != ifindex)
                                continue;
                }

                return node->vxlan;
        }

        return NULL;
}
233
234 /* Look up VNI in a per net namespace table */
235 static struct vxlan_dev *vxlan_find_vni(struct net *net, int ifindex,
236                                         __be32 vni, sa_family_t family,
237                                         __be16 port, u32 flags)
238 {
239         struct vxlan_sock *vs;
240
241         vs = vxlan_find_sock(net, family, port, flags, ifindex);
242         if (!vs)
243                 return NULL;
244
245         return vxlan_vs_find_vni(vs, ifindex, vni);
246 }
247
/* Fill in neighbour message in skbuff.
 * Builds one RTM_*NEIGH netlink message describing (@fdb, @rdst).
 * Returns 0 on success or -EMSGSIZE if @skb lacks tailroom (message is
 * cancelled in that case, leaving @skb consistent).
 */
static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan,
                          const struct vxlan_fdb *fdb,
                          u32 portid, u32 seq, int type, unsigned int flags,
                          const struct vxlan_rdst *rdst)
{
        unsigned long now = jiffies;
        struct nda_cacheinfo ci;
        struct nlmsghdr *nlh;
        struct ndmsg *ndm;
        bool send_ip, send_eth;

        nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags);
        if (nlh == NULL)
                return -EMSGSIZE;

        ndm = nlmsg_data(nlh);
        memset(ndm, 0, sizeof(*ndm));

        send_eth = send_ip = true;

        if (type == RTM_GETNEIGH) {
                /* miss notifications may carry only an IP or only a MAC;
                 * suppress the attribute that is all-zero
                 */
                send_ip = !vxlan_addr_any(&rdst->remote_ip);
                send_eth = !is_zero_ether_addr(fdb->eth_addr);
                ndm->ndm_family = send_ip ? rdst->remote_ip.sa.sa_family : AF_INET;
        } else
                ndm->ndm_family = AF_BRIDGE;
        ndm->ndm_state = fdb->state;
        ndm->ndm_ifindex = vxlan->dev->ifindex;
        ndm->ndm_flags = fdb->flags;
        if (rdst->offloaded)
                ndm->ndm_flags |= NTF_OFFLOADED;
        ndm->ndm_type = RTN_UNICAST;

        /* device may live in a different netns than its link netns */
        if (!net_eq(dev_net(vxlan->dev), vxlan->net) &&
            nla_put_s32(skb, NDA_LINK_NETNSID,
                        peernet2id(dev_net(vxlan->dev), vxlan->net)))
                goto nla_put_failure;

        if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr))
                goto nla_put_failure;

        if (send_ip && vxlan_nla_put_addr(skb, NDA_DST, &rdst->remote_ip))
                goto nla_put_failure;

        /* port/vni/ifindex are only emitted when they differ from the
         * device defaults (or are set at all)
         */
        if (rdst->remote_port && rdst->remote_port != vxlan->cfg.dst_port &&
            nla_put_be16(skb, NDA_PORT, rdst->remote_port))
                goto nla_put_failure;
        if (rdst->remote_vni != vxlan->default_dst.remote_vni &&
            nla_put_u32(skb, NDA_VNI, be32_to_cpu(rdst->remote_vni)))
                goto nla_put_failure;
        if ((vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) && fdb->vni &&
            nla_put_u32(skb, NDA_SRC_VNI,
                        be32_to_cpu(fdb->vni)))
                goto nla_put_failure;
        if (rdst->remote_ifindex &&
            nla_put_u32(skb, NDA_IFINDEX, rdst->remote_ifindex))
                goto nla_put_failure;

        ci.ndm_used      = jiffies_to_clock_t(now - fdb->used);
        ci.ndm_confirmed = 0;
        ci.ndm_updated   = jiffies_to_clock_t(now - fdb->updated);
        ci.ndm_refcnt    = 0;

        if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci))
                goto nla_put_failure;

        nlmsg_end(skb, nlh);
        return 0;

nla_put_failure:
        nlmsg_cancel(skb, nlh);
        return -EMSGSIZE;
}
322
/* Worst-case payload size of one FDB netlink message; must cover every
 * attribute vxlan_fdb_info() can emit (-EMSGSIZE there implies a bug here).
 */
static inline size_t vxlan_nlmsg_size(void)
{
        return NLMSG_ALIGN(sizeof(struct ndmsg))
                + nla_total_size(ETH_ALEN) /* NDA_LLADDR */
                + nla_total_size(sizeof(struct in6_addr)) /* NDA_DST */
                + nla_total_size(sizeof(__be16)) /* NDA_PORT */
                + nla_total_size(sizeof(__be32)) /* NDA_VNI */
                + nla_total_size(sizeof(__u32)) /* NDA_IFINDEX */
                + nla_total_size(sizeof(__s32)) /* NDA_LINK_NETNSID */
                + nla_total_size(sizeof(struct nda_cacheinfo));
}
334
/* Broadcast one FDB change to RTNLGRP_NEIGH listeners.
 * Best-effort: on allocation or fill failure the error is recorded on the
 * netlink socket via rtnl_set_sk_err() instead of being returned.
 */
static void __vxlan_fdb_notify(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb,
                               struct vxlan_rdst *rd, int type)
{
        struct net *net = dev_net(vxlan->dev);
        struct sk_buff *skb;
        int err = -ENOBUFS;

        skb = nlmsg_new(vxlan_nlmsg_size(), GFP_ATOMIC);
        if (skb == NULL)
                goto errout;

        err = vxlan_fdb_info(skb, vxlan, fdb, 0, 0, type, 0, rd);
        if (err < 0) {
                /* -EMSGSIZE implies BUG in vxlan_nlmsg_size() */
                WARN_ON(err == -EMSGSIZE);
                kfree_skb(skb);
                goto errout;
        }

        rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
        return;
errout:
        if (err < 0)
                rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
}
360
361 static void vxlan_fdb_switchdev_notifier_info(const struct vxlan_dev *vxlan,
362                             const struct vxlan_fdb *fdb,
363                             const struct vxlan_rdst *rd,
364                             struct netlink_ext_ack *extack,
365                             struct switchdev_notifier_vxlan_fdb_info *fdb_info)
366 {
367         fdb_info->info.dev = vxlan->dev;
368         fdb_info->info.extack = extack;
369         fdb_info->remote_ip = rd->remote_ip;
370         fdb_info->remote_port = rd->remote_port;
371         fdb_info->remote_vni = rd->remote_vni;
372         fdb_info->remote_ifindex = rd->remote_ifindex;
373         memcpy(fdb_info->eth_addr, fdb->eth_addr, ETH_ALEN);
374         fdb_info->vni = fdb->vni;
375         fdb_info->offloaded = rd->offloaded;
376         fdb_info->added_by_user = fdb->flags & NTF_VXLAN_ADDED_BY_USER;
377 }
378
/* Offer an FDB add/del to switchdev drivers.
 * @adding selects ADD_TO_DEVICE vs DEL_TO_DEVICE.  Returns 0 or a negative
 * errno translated from the notifier chain.
 */
static int vxlan_fdb_switchdev_call_notifiers(struct vxlan_dev *vxlan,
                                              struct vxlan_fdb *fdb,
                                              struct vxlan_rdst *rd,
                                              bool adding,
                                              struct netlink_ext_ack *extack)
{
        struct switchdev_notifier_vxlan_fdb_info info;
        enum switchdev_notifier_type notifier_type;
        int ret;

        /* a remote must exist; tolerate (and warn on) a NULL caller bug */
        if (WARN_ON(!rd))
                return 0;

        notifier_type = adding ? SWITCHDEV_VXLAN_FDB_ADD_TO_DEVICE
                               : SWITCHDEV_VXLAN_FDB_DEL_TO_DEVICE;
        vxlan_fdb_switchdev_notifier_info(vxlan, fdb, rd, NULL, &info);
        ret = call_switchdev_notifiers(notifier_type, vxlan->dev,
                                       &info.info, extack);
        return notifier_to_errno(ret);
}
399
400 static int vxlan_fdb_notify(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb,
401                             struct vxlan_rdst *rd, int type, bool swdev_notify,
402                             struct netlink_ext_ack *extack)
403 {
404         int err;
405
406         if (swdev_notify) {
407                 switch (type) {
408                 case RTM_NEWNEIGH:
409                         err = vxlan_fdb_switchdev_call_notifiers(vxlan, fdb, rd,
410                                                                  true, extack);
411                         if (err)
412                                 return err;
413                         break;
414                 case RTM_DELNEIGH:
415                         vxlan_fdb_switchdev_call_notifiers(vxlan, fdb, rd,
416                                                            false, extack);
417                         break;
418                 }
419         }
420
421         __vxlan_fdb_notify(vxlan, fdb, rd, type);
422         return 0;
423 }
424
/* Emit an RTM_GETNEIGH "l3 miss" notification for an unknown remote IP,
 * using throwaway on-stack fdb/rdst shells (nothing is inserted).
 */
static void vxlan_ip_miss(struct net_device *dev, union vxlan_addr *ipa)
{
        struct vxlan_dev *vxlan = netdev_priv(dev);
        struct vxlan_fdb f = {
                .state = NUD_STALE,
        };
        struct vxlan_rdst remote = {
                .remote_ip = *ipa, /* goes to NDA_DST */
                .remote_vni = cpu_to_be32(VXLAN_N_VID),
        };

        vxlan_fdb_notify(vxlan, &f, &remote, RTM_GETNEIGH, true, NULL);
}
438
/* Emit an RTM_GETNEIGH "l2 miss" notification for an unknown destination
 * MAC; like vxlan_ip_miss() this only reports, it does not create an entry.
 */
static void vxlan_fdb_miss(struct vxlan_dev *vxlan, const u8 eth_addr[ETH_ALEN])
{
        struct vxlan_fdb f = {
                .state = NUD_STALE,
        };
        struct vxlan_rdst remote = { };

        memcpy(f.eth_addr, eth_addr, ETH_ALEN);

        vxlan_fdb_notify(vxlan, &f, &remote, RTM_GETNEIGH, true, NULL);
}
450
/* Hash Ethernet address.
 * Reads 8 bytes from @addr — callers keep MACs in ETH_ALEN + 2 buffers
 * (see all_zeros_mac / vxlan_fdb_find_uc) so this stays in bounds — then
 * shifts out the 2 bytes that are not part of the address.
 */
static u32 eth_hash(const unsigned char *addr)
{
        u64 value = get_unaligned((u64 *)addr);

        /* only want 6 bytes */
#ifdef __BIG_ENDIAN
        value >>= 16;
#else
        value <<= 16;
#endif
        return hash_64(value, FDB_HASH_BITS);
}
464
/* Hash (MAC, VNI) for collect-metadata mode, where entries are keyed by
 * both; vxlan_salt randomizes bucket placement across boots.
 */
static u32 eth_vni_hash(const unsigned char *addr, __be32 vni)
{
        /* use 1 byte of OUI and 3 bytes of NIC */
        u32 key = get_unaligned((u32 *)(addr + 2));

        return jhash_2words(key, vni, vxlan_salt) & (FDB_HASH_SIZE - 1);
}
472
473 /* Hash chain to use given mac address */
474 static inline struct hlist_head *vxlan_fdb_head(struct vxlan_dev *vxlan,
475                                                 const u8 *mac, __be32 vni)
476 {
477         if (vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA)
478                 return &vxlan->fdb_head[eth_vni_hash(mac, vni)];
479         else
480                 return &vxlan->fdb_head[eth_hash(mac)];
481 }
482
483 /* Look up Ethernet address in forwarding table */
484 static struct vxlan_fdb *__vxlan_find_mac(struct vxlan_dev *vxlan,
485                                           const u8 *mac, __be32 vni)
486 {
487         struct hlist_head *head = vxlan_fdb_head(vxlan, mac, vni);
488         struct vxlan_fdb *f;
489
490         hlist_for_each_entry_rcu(f, head, hlist) {
491                 if (ether_addr_equal(mac, f->eth_addr)) {
492                         if (vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) {
493                                 if (vni == f->vni)
494                                         return f;
495                         } else {
496                                 return f;
497                         }
498                 }
499         }
500
501         return NULL;
502 }
503
504 static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan,
505                                         const u8 *mac, __be32 vni)
506 {
507         struct vxlan_fdb *f;
508
509         f = __vxlan_find_mac(vxlan, mac, vni);
510         if (f && f->used != jiffies)
511                 f->used = jiffies;
512
513         return f;
514 }
515
/* caller should hold vxlan->hash_lock */
/* Find the remote of @f that matches all four of (ip, port, vni, ifindex);
 * returns NULL when none does.
 */
static struct vxlan_rdst *vxlan_fdb_find_rdst(struct vxlan_fdb *f,
                                              union vxlan_addr *ip, __be16 port,
                                              __be32 vni, __u32 ifindex)
{
        struct vxlan_rdst *rd;

        list_for_each_entry(rd, &f->remotes, list) {
                if (vxlan_addr_equal(&rd->remote_ip, ip) &&
                    rd->remote_port == port &&
                    rd->remote_vni == vni &&
                    rd->remote_ifindex == ifindex)
                        return rd;
        }

        return NULL;
}
533
/* Look up a unicast MAC for switchdev drivers and fill @fdb_info from its
 * first remote.  Rejects multicast/zero MACs with -EINVAL and missing
 * entries with -ENOENT.  The MAC is copied into an ETH_ALEN + 2 buffer
 * because eth_hash() reads 8 bytes.
 */
int vxlan_fdb_find_uc(struct net_device *dev, const u8 *mac, __be32 vni,
                      struct switchdev_notifier_vxlan_fdb_info *fdb_info)
{
        struct vxlan_dev *vxlan = netdev_priv(dev);
        u8 eth_addr[ETH_ALEN + 2] = { 0 };
        struct vxlan_rdst *rdst;
        struct vxlan_fdb *f;
        int rc = 0;

        if (is_multicast_ether_addr(mac) ||
            is_zero_ether_addr(mac))
                return -EINVAL;

        ether_addr_copy(eth_addr, mac);

        rcu_read_lock();

        f = __vxlan_find_mac(vxlan, eth_addr, vni);
        if (!f) {
                rc = -ENOENT;
                goto out;
        }

        rdst = first_remote_rcu(f);
        vxlan_fdb_switchdev_notifier_info(vxlan, f, rdst, NULL, fdb_info);

out:
        rcu_read_unlock();
        return rc;
}
EXPORT_SYMBOL_GPL(vxlan_fdb_find_uc);
565
/* Replay a single (fdb, rdst) pair as an ADD event to one notifier block;
 * returns an errno translated from the notifier's answer.
 */
static int vxlan_fdb_notify_one(struct notifier_block *nb,
                                const struct vxlan_dev *vxlan,
                                const struct vxlan_fdb *f,
                                const struct vxlan_rdst *rdst,
                                struct netlink_ext_ack *extack)
{
        struct switchdev_notifier_vxlan_fdb_info fdb_info;
        int rc;

        vxlan_fdb_switchdev_notifier_info(vxlan, f, rdst, extack, &fdb_info);
        rc = nb->notifier_call(nb, SWITCHDEV_VXLAN_FDB_ADD_TO_DEVICE,
                               &fdb_info);
        return notifier_to_errno(rc);
}
580
/* Replay every FDB entry of @vni on @dev to notifier @nb, so a newly
 * attached switchdev driver can learn the current table.  Walks all hash
 * buckets under hash_lock and stops at the first notifier error.
 */
int vxlan_fdb_replay(const struct net_device *dev, __be32 vni,
                     struct notifier_block *nb,
                     struct netlink_ext_ack *extack)
{
        struct vxlan_dev *vxlan;
        struct vxlan_rdst *rdst;
        struct vxlan_fdb *f;
        unsigned int h;
        int rc = 0;

        if (!netif_is_vxlan(dev))
                return -EINVAL;
        vxlan = netdev_priv(dev);

        spin_lock_bh(&vxlan->hash_lock);
        for (h = 0; h < FDB_HASH_SIZE; ++h) {
                hlist_for_each_entry(f, &vxlan->fdb_head[h], hlist) {
                        if (f->vni == vni) {
                                list_for_each_entry(rdst, &f->remotes, list) {
                                        rc = vxlan_fdb_notify_one(nb, vxlan,
                                                                  f, rdst,
                                                                  extack);
                                        if (rc)
                                                goto out;
                                }
                        }
                }
        }

out:
        spin_unlock_bh(&vxlan->hash_lock);
        return rc;
}
EXPORT_SYMBOL_GPL(vxlan_fdb_replay);
615
/* Clear the offloaded flag on every remote of every FDB entry for @vni;
 * called when a switchdev driver stops offloading this device/VNI.
 */
void vxlan_fdb_clear_offload(const struct net_device *dev, __be32 vni)
{
        struct vxlan_dev *vxlan;
        struct vxlan_rdst *rdst;
        struct vxlan_fdb *f;
        unsigned int h;

        if (!netif_is_vxlan(dev))
                return;
        vxlan = netdev_priv(dev);

        spin_lock_bh(&vxlan->hash_lock);
        for (h = 0; h < FDB_HASH_SIZE; ++h) {
                hlist_for_each_entry(f, &vxlan->fdb_head[h], hlist)
                        if (f->vni == vni)
                                list_for_each_entry(rdst, &f->remotes, list)
                                        rdst->offloaded = false;
        }
        spin_unlock_bh(&vxlan->hash_lock);
}
EXPORT_SYMBOL_GPL(vxlan_fdb_clear_offload);
637
/* Replace destination of unicast mac.
 * Rewrites the first remote of @f in place with the new tuple, saving the
 * previous values in @oldrd for the caller's notification/rollback.
 * Returns 1 if the remote changed, 0 if it already matched (or the list
 * was unexpectedly empty).
 */
static int vxlan_fdb_replace(struct vxlan_fdb *f,
                             union vxlan_addr *ip, __be16 port, __be32 vni,
                             __u32 ifindex, struct vxlan_rdst *oldrd)
{
        struct vxlan_rdst *rd;

        rd = vxlan_fdb_find_rdst(f, ip, port, vni, ifindex);
        if (rd)
                return 0;

        rd = list_first_entry_or_null(&f->remotes, struct vxlan_rdst, list);
        if (!rd)
                return 0;

        *oldrd = *rd;
        /* destination changed: cached routes are stale */
        dst_cache_reset(&rd->dst_cache);
        rd->remote_ip = *ip;
        rd->remote_port = port;
        rd->remote_vni = vni;
        rd->remote_ifindex = ifindex;
        rd->offloaded = false;
        return 1;
}
662
/* Add/update destinations for multicast.
 * Appends a new remote with the given tuple unless one already exists.
 * Returns 1 when a remote was added (*rdp set), 0 when it already existed,
 * or -ENOBUFS on allocation failure.
 */
static int vxlan_fdb_append(struct vxlan_fdb *f,
                            union vxlan_addr *ip, __be16 port, __be32 vni,
                            __u32 ifindex, struct vxlan_rdst **rdp)
{
        struct vxlan_rdst *rd;

        rd = vxlan_fdb_find_rdst(f, ip, port, vni, ifindex);
        if (rd)
                return 0;

        /* GFP_ATOMIC: may be called from the packet-processing path */
        rd = kmalloc(sizeof(*rd), GFP_ATOMIC);
        if (rd == NULL)
                return -ENOBUFS;

        if (dst_cache_init(&rd->dst_cache, GFP_ATOMIC)) {
                kfree(rd);
                return -ENOBUFS;
        }

        rd->remote_ip = *ip;
        rd->remote_port = port;
        rd->offloaded = false;
        rd->remote_vni = vni;
        rd->remote_ifindex = ifindex;

        /* RCU publish: readers may traverse f->remotes locklessly */
        list_add_tail_rcu(&rd->list, &f->remotes);

        *rdp = rd;
        return 1;
}
694
/* Handle the remote-checksum-offload (RCO) extension during GRO.
 * Undoes the remote checksum trick encoded in @vni_field so the inner
 * packet can be aggregated.  Returns the (possibly relocated) VXLAN
 * header, or NULL if the outer checksum was not validated.
 */
static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb,
                                          unsigned int off,
                                          struct vxlanhdr *vh, size_t hdrlen,
                                          __be32 vni_field,
                                          struct gro_remcsum *grc,
                                          bool nopartial)
{
        size_t start, offset;

        /* already processed on an earlier pass */
        if (skb->remcsum_offload)
                return vh;

        if (!NAPI_GRO_CB(skb)->csum_valid)
                return NULL;

        start = vxlan_rco_start(vni_field);
        offset = start + vxlan_rco_offset(vni_field);

        vh = skb_gro_remcsum_process(skb, (void *)vh, off, hdrlen,
                                     start, offset, grc, nopartial);

        skb->remcsum_offload = 1;

        return vh;
}
720
/* GRO receive handler for the VXLAN UDP tunnel socket.
 * Parses the VXLAN header, optionally processes remote checksum offload,
 * marks held packets whose VXLAN flags/VNI differ as not-same-flow, then
 * hands the inner Ethernet frame to eth_gro_receive().  On any parse
 * failure the packet is flushed (flush = 1).
 */
static struct sk_buff *vxlan_gro_receive(struct sock *sk,
                                         struct list_head *head,
                                         struct sk_buff *skb)
{
        struct sk_buff *pp = NULL;
        struct sk_buff *p;
        struct vxlanhdr *vh, *vh2;
        unsigned int hlen, off_vx;
        int flush = 1;
        struct vxlan_sock *vs = rcu_dereference_sk_user_data(sk);
        __be32 flags;
        struct gro_remcsum grc;

        skb_gro_remcsum_init(&grc);

        /* make sure the full VXLAN header is linear in the GRO area */
        off_vx = skb_gro_offset(skb);
        hlen = off_vx + sizeof(*vh);
        vh   = skb_gro_header_fast(skb, off_vx);
        if (skb_gro_header_hard(skb, hlen)) {
                vh = skb_gro_header_slow(skb, hlen, off_vx);
                if (unlikely(!vh))
                        goto out;
        }

        skb_gro_postpull_rcsum(skb, vh, sizeof(struct vxlanhdr));

        flags = vh->vx_flags;

        if ((flags & VXLAN_HF_RCO) && (vs->flags & VXLAN_F_REMCSUM_RX)) {
                vh = vxlan_gro_remcsum(skb, off_vx, vh, sizeof(struct vxlanhdr),
                                       vh->vx_vni, &grc,
                                       !!(vs->flags &
                                          VXLAN_F_REMCSUM_NOPARTIAL));

                if (!vh)
                        goto out;
        }

        skb_gro_pull(skb, sizeof(struct vxlanhdr)); /* pull vxlan header */

        /* packets on the hold list only aggregate if their VXLAN header
         * matches ours
         */
        list_for_each_entry(p, head, list) {
                if (!NAPI_GRO_CB(p)->same_flow)
                        continue;

                vh2 = (struct vxlanhdr *)(p->data + off_vx);
                if (vh->vx_flags != vh2->vx_flags ||
                    vh->vx_vni != vh2->vx_vni) {
                        NAPI_GRO_CB(p)->same_flow = 0;
                        continue;
                }
        }

        pp = call_gro_receive(eth_gro_receive, head, skb);
        flush = 0;

out:
        skb_gro_flush_final_remcsum(skb, pp, flush, &grc);

        return pp;
}
781
/* GRO complete handler: finish the aggregated inner Ethernet frame that
 * starts just past the VXLAN header.
 */
static int vxlan_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff)
{
        /* Sets 'skb->inner_mac_header' since we are always called with
         * 'skb->encapsulation' set.
         */
        return eth_gro_complete(skb, nhoff + sizeof(struct vxlanhdr));
}
789
/* Allocate and initialize an FDB entry (no remotes yet, not hashed in).
 * GFP_ATOMIC because learning can happen in the receive path.
 * Returns NULL on allocation failure.
 */
static struct vxlan_fdb *vxlan_fdb_alloc(struct vxlan_dev *vxlan,
                                         const u8 *mac, __u16 state,
                                         __be32 src_vni, __u16 ndm_flags)
{
        struct vxlan_fdb *f;

        f = kmalloc(sizeof(*f), GFP_ATOMIC);
        if (!f)
                return NULL;
        f->state = state;
        f->flags = ndm_flags;
        f->updated = f->used = jiffies;
        f->vni = src_vni;
        INIT_LIST_HEAD(&f->remotes);
        memcpy(f->eth_addr, mac, ETH_ALEN);

        return f;
}
808
/* Create a new FDB entry with one remote and publish it into the hash
 * table (RCU).  Enforces cfg.addrmax if set.  On success *fdb points at
 * the new entry; returns 0, -ENOSPC (table full), -ENOMEM, or the error
 * from vxlan_fdb_append().
 */
static int vxlan_fdb_create(struct vxlan_dev *vxlan,
                            const u8 *mac, union vxlan_addr *ip,
                            __u16 state, __be16 port, __be32 src_vni,
                            __be32 vni, __u32 ifindex, __u16 ndm_flags,
                            struct vxlan_fdb **fdb)
{
        struct vxlan_rdst *rd = NULL;
        struct vxlan_fdb *f;
        int rc;

        if (vxlan->cfg.addrmax &&
            vxlan->addrcnt >= vxlan->cfg.addrmax)
                return -ENOSPC;

        netdev_dbg(vxlan->dev, "add %pM -> %pIS\n", mac, ip);
        f = vxlan_fdb_alloc(vxlan, mac, state, src_vni, ndm_flags);
        if (!f)
                return -ENOMEM;

        rc = vxlan_fdb_append(f, ip, port, vni, ifindex, &rd);
        if (rc < 0) {
                /* not yet hashed in, plain kfree is safe */
                kfree(f);
                return rc;
        }

        ++vxlan->addrcnt;
        hlist_add_head_rcu(&f->hlist,
                           vxlan_fdb_head(vxlan, mac, src_vni));

        *fdb = f;

        return 0;
}
842
/* RCU callback: free an FDB entry and all of its remote destinations
 * after the grace period (scheduled by vxlan_fdb_destroy()).
 */
static void vxlan_fdb_free(struct rcu_head *head)
{
        struct vxlan_fdb *f = container_of(head, struct vxlan_fdb, rcu);
        struct vxlan_rdst *rd, *nd;

        list_for_each_entry_safe(rd, nd, &f->remotes, list) {
                dst_cache_destroy(&rd->dst_cache);
                kfree(rd);
        }
        kfree(f);
}
854
/* Unlink an FDB entry and schedule its RCU free.
 * When @do_notify, emits an RTM_DELNEIGH for every remote first
 * (optionally including switchdev, per @swdev_notify).
 */
static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f,
                              bool do_notify, bool swdev_notify)
{
        struct vxlan_rdst *rd;

        netdev_dbg(vxlan->dev, "delete %pM\n", f->eth_addr);

        --vxlan->addrcnt;
        if (do_notify)
                list_for_each_entry(rd, &f->remotes, list)
                        vxlan_fdb_notify(vxlan, f, rd, RTM_DELNEIGH,
                                         swdev_notify, NULL);

        hlist_del_rcu(&f->hlist);
        call_rcu(&f->rcu, vxlan_fdb_free);
}
871
/* RCU callback: free a single remote destination (used when one remote is
 * removed while its FDB entry lives on).
 */
static void vxlan_dst_free(struct rcu_head *head)
{
        struct vxlan_rdst *rd = container_of(head, struct vxlan_rdst, rcu);

        dst_cache_destroy(&rd->dst_cache);
        kfree(rd);
}
879
/* Apply a netlink FDB request to an already existing entry @f.
 *
 * Callers hold vxlan->hash_lock.  Depending on @flags this may update the
 * entry's state/flags, replace its (unicast-only) destination with
 * NLM_F_REPLACE, or append an extra destination with NLM_F_APPEND.  When
 * anything changed, an RTM_NEWNEIGH notification is sent; if that
 * notification fails, the destination change is rolled back and the
 * notification error is returned.
 */
static int vxlan_fdb_update_existing(struct vxlan_dev *vxlan,
				     union vxlan_addr *ip,
				     __u16 state, __u16 flags,
				     __be16 port, __be32 vni,
				     __u32 ifindex, __u16 ndm_flags,
				     struct vxlan_fdb *f,
				     bool swdev_notify,
				     struct netlink_ext_ack *extack)
{
	/* NTF_USE only refreshes f->used below; it is never stored in flags */
	__u16 fdb_flags = (ndm_flags & ~NTF_USE);
	struct vxlan_rdst *rd = NULL;
	struct vxlan_rdst oldrd;	/* saved for rollback of a replace */
	int notify = 0;			/* non-zero once anything changed */
	int rc = 0;
	int err;

	/* Do not allow an externally learned entry to take over an entry added
	 * by the user.
	 */
	if (!(fdb_flags & NTF_EXT_LEARNED) ||
	    !(f->flags & NTF_VXLAN_ADDED_BY_USER)) {
		if (f->state != state) {
			f->state = state;
			f->updated = jiffies;
			notify = 1;
		}
		if (f->flags != fdb_flags) {
			f->flags = fdb_flags;
			f->updated = jiffies;
			notify = 1;
		}
	}

	if ((flags & NLM_F_REPLACE)) {
		/* Only change unicasts */
		if (!(is_multicast_ether_addr(f->eth_addr) ||
		      is_zero_ether_addr(f->eth_addr))) {
			rc = vxlan_fdb_replace(f, ip, port, vni,
					       ifindex, &oldrd);
			notify |= rc;
		} else {
			return -EOPNOTSUPP;
		}
	}
	if ((flags & NLM_F_APPEND) &&
	    (is_multicast_ether_addr(f->eth_addr) ||
	     is_zero_ether_addr(f->eth_addr))) {
		rc = vxlan_fdb_append(f, ip, port, vni, ifindex, &rd);

		if (rc < 0)
			return rc;
		notify |= rc;
	}

	if (ndm_flags & NTF_USE)
		f->used = jiffies;

	if (notify) {
		/* rd is only set by a successful append; otherwise notify
		 * about the first remote of the entry.
		 */
		if (rd == NULL)
			rd = first_remote_rtnl(f);

		err = vxlan_fdb_notify(vxlan, f, rd, RTM_NEWNEIGH,
				       swdev_notify, extack);
		if (err)
			goto err_notify;
	}

	return 0;

err_notify:
	/* Undo the destination change so a failed notification leaves the
	 * entry as it was: restore the replaced remote, or unlink and free
	 * the freshly appended one.
	 */
	if ((flags & NLM_F_REPLACE) && rc)
		*rd = oldrd;
	else if ((flags & NLM_F_APPEND) && rc) {
		list_del_rcu(&rd->list);
		call_rcu(&rd->rcu, vxlan_dst_free);
	}
	return err;
}
958
959 static int vxlan_fdb_update_create(struct vxlan_dev *vxlan,
960                                    const u8 *mac, union vxlan_addr *ip,
961                                    __u16 state, __u16 flags,
962                                    __be16 port, __be32 src_vni, __be32 vni,
963                                    __u32 ifindex, __u16 ndm_flags,
964                                    bool swdev_notify,
965                                    struct netlink_ext_ack *extack)
966 {
967         __u16 fdb_flags = (ndm_flags & ~NTF_USE);
968         struct vxlan_fdb *f;
969         int rc;
970
971         /* Disallow replace to add a multicast entry */
972         if ((flags & NLM_F_REPLACE) &&
973             (is_multicast_ether_addr(mac) || is_zero_ether_addr(mac)))
974                 return -EOPNOTSUPP;
975
976         netdev_dbg(vxlan->dev, "add %pM -> %pIS\n", mac, ip);
977         rc = vxlan_fdb_create(vxlan, mac, ip, state, port, src_vni,
978                               vni, ifindex, fdb_flags, &f);
979         if (rc < 0)
980                 return rc;
981
982         rc = vxlan_fdb_notify(vxlan, f, first_remote_rtnl(f), RTM_NEWNEIGH,
983                               swdev_notify, extack);
984         if (rc)
985                 goto err_notify;
986
987         return 0;
988
989 err_notify:
990         vxlan_fdb_destroy(vxlan, f, false, false);
991         return rc;
992 }
993
994 /* Add new entry to forwarding table -- assumes lock held */
995 static int vxlan_fdb_update(struct vxlan_dev *vxlan,
996                             const u8 *mac, union vxlan_addr *ip,
997                             __u16 state, __u16 flags,
998                             __be16 port, __be32 src_vni, __be32 vni,
999                             __u32 ifindex, __u16 ndm_flags,
1000                             bool swdev_notify,
1001                             struct netlink_ext_ack *extack)
1002 {
1003         struct vxlan_fdb *f;
1004
1005         f = __vxlan_find_mac(vxlan, mac, src_vni);
1006         if (f) {
1007                 if (flags & NLM_F_EXCL) {
1008                         netdev_dbg(vxlan->dev,
1009                                    "lost race to create %pM\n", mac);
1010                         return -EEXIST;
1011                 }
1012
1013                 return vxlan_fdb_update_existing(vxlan, ip, state, flags, port,
1014                                                  vni, ifindex, ndm_flags, f,
1015                                                  swdev_notify, extack);
1016         } else {
1017                 if (!(flags & NLM_F_CREATE))
1018                         return -ENOENT;
1019
1020                 return vxlan_fdb_update_create(vxlan, mac, ip, state, flags,
1021                                                port, src_vni, vni, ifindex,
1022                                                ndm_flags, swdev_notify, extack);
1023         }
1024 }
1025
/* Remove a single remote destination @rd from entry @f and free it.
 *
 * The remote is unlinked with RCU semantics first, an RTM_DELNEIGH
 * notification is sent, and the actual free is deferred past a grace
 * period so concurrent lockless readers of f->remotes stay safe.
 */
static void vxlan_fdb_dst_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f,
				  struct vxlan_rdst *rd, bool swdev_notify)
{
	list_del_rcu(&rd->list);
	vxlan_fdb_notify(vxlan, f, rd, RTM_DELNEIGH, swdev_notify, NULL);
	call_rcu(&rd->rcu, vxlan_dst_free);
}
1033
1034 static int vxlan_fdb_parse(struct nlattr *tb[], struct vxlan_dev *vxlan,
1035                            union vxlan_addr *ip, __be16 *port, __be32 *src_vni,
1036                            __be32 *vni, u32 *ifindex)
1037 {
1038         struct net *net = dev_net(vxlan->dev);
1039         int err;
1040
1041         if (tb[NDA_DST]) {
1042                 err = vxlan_nla_get_addr(ip, tb[NDA_DST]);
1043                 if (err)
1044                         return err;
1045         } else {
1046                 union vxlan_addr *remote = &vxlan->default_dst.remote_ip;
1047                 if (remote->sa.sa_family == AF_INET) {
1048                         ip->sin.sin_addr.s_addr = htonl(INADDR_ANY);
1049                         ip->sa.sa_family = AF_INET;
1050 #if IS_ENABLED(CONFIG_IPV6)
1051                 } else {
1052                         ip->sin6.sin6_addr = in6addr_any;
1053                         ip->sa.sa_family = AF_INET6;
1054 #endif
1055                 }
1056         }
1057
1058         if (tb[NDA_PORT]) {
1059                 if (nla_len(tb[NDA_PORT]) != sizeof(__be16))
1060                         return -EINVAL;
1061                 *port = nla_get_be16(tb[NDA_PORT]);
1062         } else {
1063                 *port = vxlan->cfg.dst_port;
1064         }
1065
1066         if (tb[NDA_VNI]) {
1067                 if (nla_len(tb[NDA_VNI]) != sizeof(u32))
1068                         return -EINVAL;
1069                 *vni = cpu_to_be32(nla_get_u32(tb[NDA_VNI]));
1070         } else {
1071                 *vni = vxlan->default_dst.remote_vni;
1072         }
1073
1074         if (tb[NDA_SRC_VNI]) {
1075                 if (nla_len(tb[NDA_SRC_VNI]) != sizeof(u32))
1076                         return -EINVAL;
1077                 *src_vni = cpu_to_be32(nla_get_u32(tb[NDA_SRC_VNI]));
1078         } else {
1079                 *src_vni = vxlan->default_dst.remote_vni;
1080         }
1081
1082         if (tb[NDA_IFINDEX]) {
1083                 struct net_device *tdev;
1084
1085                 if (nla_len(tb[NDA_IFINDEX]) != sizeof(u32))
1086                         return -EINVAL;
1087                 *ifindex = nla_get_u32(tb[NDA_IFINDEX]);
1088                 tdev = __dev_get_by_index(net, *ifindex);
1089                 if (!tdev)
1090                         return -EADDRNOTAVAIL;
1091         } else {
1092                 *ifindex = 0;
1093         }
1094
1095         return 0;
1096 }
1097
1098 /* Add static entry (via netlink) */
1099 static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
1100                          struct net_device *dev,
1101                          const unsigned char *addr, u16 vid, u16 flags,
1102                          struct netlink_ext_ack *extack)
1103 {
1104         struct vxlan_dev *vxlan = netdev_priv(dev);
1105         /* struct net *net = dev_net(vxlan->dev); */
1106         union vxlan_addr ip;
1107         __be16 port;
1108         __be32 src_vni, vni;
1109         u32 ifindex;
1110         int err;
1111
1112         if (!(ndm->ndm_state & (NUD_PERMANENT|NUD_REACHABLE))) {
1113                 pr_info("RTM_NEWNEIGH with invalid state %#x\n",
1114                         ndm->ndm_state);
1115                 return -EINVAL;
1116         }
1117
1118         if (tb[NDA_DST] == NULL)
1119                 return -EINVAL;
1120
1121         err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex);
1122         if (err)
1123                 return err;
1124
1125         if (vxlan->default_dst.remote_ip.sa.sa_family != ip.sa.sa_family)
1126                 return -EAFNOSUPPORT;
1127
1128         spin_lock_bh(&vxlan->hash_lock);
1129         err = vxlan_fdb_update(vxlan, addr, &ip, ndm->ndm_state, flags,
1130                                port, src_vni, vni, ifindex,
1131                                ndm->ndm_flags | NTF_VXLAN_ADDED_BY_USER,
1132                                true, extack);
1133         spin_unlock_bh(&vxlan->hash_lock);
1134
1135         return err;
1136 }
1137
/* Delete one remote destination of an FDB entry, or the whole entry.
 *
 * Callers hold vxlan->hash_lock.  When @ip names a specific destination
 * (together with @port/@vni/@ifindex), only that remote is removed unless
 * it is the entry's last one, in which case the entire entry goes away.
 *
 * Returns -ENOENT only when no entry exists for @addr/@src_vni; a request
 * naming a remote that the entry does not have still returns 0.
 */
static int __vxlan_fdb_delete(struct vxlan_dev *vxlan,
			      const unsigned char *addr, union vxlan_addr ip,
			      __be16 port, __be32 src_vni, __be32 vni,
			      u32 ifindex, bool swdev_notify)
{
	struct vxlan_fdb *f;
	struct vxlan_rdst *rd = NULL;
	int err = -ENOENT;

	f = vxlan_find_mac(vxlan, addr, src_vni);
	if (!f)
		return err;

	/* A non-wildcard address selects a specific remote to remove */
	if (!vxlan_addr_any(&ip)) {
		rd = vxlan_fdb_find_rdst(f, &ip, port, vni, ifindex);
		if (!rd)
			goto out;
	}

	/* remove a destination if it's not the only one on the list,
	 * otherwise destroy the fdb entry
	 */
	if (rd && !list_is_singular(&f->remotes)) {
		vxlan_fdb_dst_destroy(vxlan, f, rd, swdev_notify);
		goto out;
	}

	vxlan_fdb_destroy(vxlan, f, true, swdev_notify);

out:
	return 0;
}
1170
1171 /* Delete entry (via netlink) */
1172 static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
1173                             struct net_device *dev,
1174                             const unsigned char *addr, u16 vid)
1175 {
1176         struct vxlan_dev *vxlan = netdev_priv(dev);
1177         union vxlan_addr ip;
1178         __be32 src_vni, vni;
1179         __be16 port;
1180         u32 ifindex;
1181         int err;
1182
1183         err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex);
1184         if (err)
1185                 return err;
1186
1187         spin_lock_bh(&vxlan->hash_lock);
1188         err = __vxlan_fdb_delete(vxlan, addr, ip, port, src_vni, vni, ifindex,
1189                                  true);
1190         spin_unlock_bh(&vxlan->hash_lock);
1191
1192         return err;
1193 }
1194
/* Dump forwarding table (netlink RTM_GETNEIGH dump).
 *
 * Walks every hash bucket and every remote of every entry, emitting one
 * RTM_NEWNEIGH record per remote.  cb->args[2] carries the index at which
 * a previous, truncated dump stopped; entries below it are skipped so the
 * dump resumes where it left off.  *idx counts emitted/seen remotes for
 * the caller.
 */
static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
			  struct net_device *dev,
			  struct net_device *filter_dev, int *idx)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	unsigned int h;
	int err = 0;

	for (h = 0; h < FDB_HASH_SIZE; ++h) {
		struct vxlan_fdb *f;

		hlist_for_each_entry_rcu(f, &vxlan->fdb_head[h], hlist) {
			struct vxlan_rdst *rd;

			list_for_each_entry_rcu(rd, &f->remotes, list) {
				if (*idx < cb->args[2])
					goto skip;

				err = vxlan_fdb_info(skb, vxlan, f,
						     NETLINK_CB(cb->skb).portid,
						     cb->nlh->nlmsg_seq,
						     RTM_NEWNEIGH,
						     NLM_F_MULTI, rd);
				/* skb full: stop, dump will be resumed */
				if (err < 0)
					goto out;
skip:
				*idx += 1;
			}
		}
	}
out:
	return err;
}
1229
/* Look up a single FDB entry (netlink RTM_GETNEIGH without dump flag)
 * and fill @skb with one RTM_NEWNEIGH record for its first remote.
 *
 * The VNI defaults to the device's default remote VNI when NDA_VNI is
 * absent.  The lookup and the info fill run under rcu_read_lock since
 * the hash table is traversed locklessly.
 */
static int vxlan_fdb_get(struct sk_buff *skb,
			 struct nlattr *tb[],
			 struct net_device *dev,
			 const unsigned char *addr,
			 u16 vid, u32 portid, u32 seq,
			 struct netlink_ext_ack *extack)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_fdb *f;
	__be32 vni;
	int err;

	if (tb[NDA_VNI])
		vni = cpu_to_be32(nla_get_u32(tb[NDA_VNI]));
	else
		vni = vxlan->default_dst.remote_vni;

	rcu_read_lock();

	f = __vxlan_find_mac(vxlan, addr, vni);
	if (!f) {
		NL_SET_ERR_MSG(extack, "Fdb entry not found");
		err = -ENOENT;
		goto errout;
	}

	err = vxlan_fdb_info(skb, vxlan, f, portid, seq,
			     RTM_NEWNEIGH, 0, first_remote_rcu(f));
errout:
	rcu_read_unlock();
	return err;
}
1262
/* Watch incoming packets to learn mapping between Ethernet address
 * and Tunnel endpoint.
 * Return true if packet is bogus and should be dropped.
 *
 * Runs on the receive path.  Known MACs may migrate to a new tunnel
 * endpoint (except static NUD_PERMANENT/NUD_NOARP entries, whose packets
 * are dropped instead); unknown MACs are learned as NUD_REACHABLE under
 * hash_lock.
 */
static bool vxlan_snoop(struct net_device *dev,
			union vxlan_addr *src_ip, const u8 *src_mac,
			u32 src_ifindex, __be32 vni)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_fdb *f;
	u32 ifindex = 0;

#if IS_ENABLED(CONFIG_IPV6)
	/* IPv6 link-local addresses are only meaningful together with the
	 * interface they were received on, so keep that scope.
	 */
	if (src_ip->sa.sa_family == AF_INET6 &&
	    (ipv6_addr_type(&src_ip->sin6.sin6_addr) & IPV6_ADDR_LINKLOCAL))
		ifindex = src_ifindex;
#endif

	f = vxlan_find_mac(vxlan, src_mac, vni);
	if (likely(f)) {
		struct vxlan_rdst *rdst = first_remote_rcu(f);

		/* Fast path: endpoint unchanged, nothing to do */
		if (likely(vxlan_addr_equal(&rdst->remote_ip, src_ip) &&
			   rdst->remote_ifindex == ifindex))
			return false;

		/* Don't migrate static entries, drop packets */
		if (f->state & (NUD_PERMANENT | NUD_NOARP))
			return true;

		if (net_ratelimit())
			netdev_info(dev,
				    "%pM migrated from %pIS to %pIS\n",
				    src_mac, &rdst->remote_ip.sa, &src_ip->sa);

		rdst->remote_ip = *src_ip;
		f->updated = jiffies;
		vxlan_fdb_notify(vxlan, f, rdst, RTM_NEWNEIGH, true, NULL);
	} else {
		/* learned new entry */
		spin_lock(&vxlan->hash_lock);

		/* close off race between vxlan_flush and incoming packets */
		if (netif_running(dev))
			vxlan_fdb_update(vxlan, src_mac, src_ip,
					 NUD_REACHABLE,
					 NLM_F_EXCL|NLM_F_CREATE,
					 vxlan->cfg.dst_port,
					 vni,
					 vxlan->default_dst.remote_vni,
					 ifindex, NTF_SELF, true, NULL);
		spin_unlock(&vxlan->hash_lock);
	}

	return false;
}
1319
/* See if multicast group is already in use by other ID.
 *
 * Returns true when another running vxlan device sharing the same
 * underlying socket also uses @dev's default remote group/ifindex, in
 * which case @dev must not leave the multicast group on shutdown.
 */
static bool vxlan_group_used(struct vxlan_net *vn, struct vxlan_dev *dev)
{
	struct vxlan_dev *vxlan;
	struct vxlan_sock *sock4;
#if IS_ENABLED(CONFIG_IPV6)
	struct vxlan_sock *sock6;
#endif
	unsigned short family = dev->default_dst.remote_ip.sa.sa_family;

	sock4 = rtnl_dereference(dev->vn4_sock);

	/* The vxlan_sock is only used by dev, leaving group has
	 * no effect on other vxlan devices.
	 */
	if (family == AF_INET && sock4 && refcount_read(&sock4->refcnt) == 1)
		return false;
#if IS_ENABLED(CONFIG_IPV6)
	sock6 = rtnl_dereference(dev->vn6_sock);
	if (family == AF_INET6 && sock6 && refcount_read(&sock6->refcnt) == 1)
		return false;
#endif

	list_for_each_entry(vxlan, &vn->vxlan_list, next) {
		if (!netif_running(vxlan->dev) || vxlan == dev)
			continue;

		/* Only devices sharing our socket can conflict */
		if (family == AF_INET &&
		    rtnl_dereference(vxlan->vn4_sock) != sock4)
			continue;
#if IS_ENABLED(CONFIG_IPV6)
		if (family == AF_INET6 &&
		    rtnl_dereference(vxlan->vn6_sock) != sock6)
			continue;
#endif

		if (!vxlan_addr_equal(&vxlan->default_dst.remote_ip,
				      &dev->default_dst.remote_ip))
			continue;

		if (vxlan->default_dst.remote_ifindex !=
		    dev->default_dst.remote_ifindex)
			continue;

		return true;
	}

	return false;
}
1369
/* Drop one reference on @vs and, if it was the last one, unhash the
 * socket and withdraw its UDP tunnel offload port.
 *
 * Returns true when the caller now owns the socket and must finish the
 * release (close the UDP socket and free @vs); false when other users
 * remain or @vs is NULL.
 */
static bool __vxlan_sock_release_prep(struct vxlan_sock *vs)
{
	struct vxlan_net *vn;

	if (!vs)
		return false;
	if (!refcount_dec_and_test(&vs->refcnt))
		return false;

	vn = net_generic(sock_net(vs->sock->sk), vxlan_net_id);
	spin_lock(&vn->sock_lock);
	hlist_del_rcu(&vs->hlist);
	/* Tell offloading NICs this UDP port no longer carries VXLAN */
	udp_tunnel_notify_del_rx_port(vs->sock,
				      (vs->flags & VXLAN_F_GPE) ?
				      UDP_TUNNEL_TYPE_VXLAN_GPE :
				      UDP_TUNNEL_TYPE_VXLAN);
	spin_unlock(&vn->sock_lock);

	return true;
}
1390
/* Detach the device from its v4/v6 UDP sockets and release them if this
 * was the last user.
 *
 * Ordering matters: the device's socket pointers are cleared first and
 * synchronize_net() waits for in-flight receive paths to finish before
 * the sockets can actually be torn down.
 */
static void vxlan_sock_release(struct vxlan_dev *vxlan)
{
	struct vxlan_sock *sock4 = rtnl_dereference(vxlan->vn4_sock);
#if IS_ENABLED(CONFIG_IPV6)
	struct vxlan_sock *sock6 = rtnl_dereference(vxlan->vn6_sock);

	RCU_INIT_POINTER(vxlan->vn6_sock, NULL);
#endif

	RCU_INIT_POINTER(vxlan->vn4_sock, NULL);
	synchronize_net();

	vxlan_vs_del_dev(vxlan);

	if (__vxlan_sock_release_prep(sock4)) {
		udp_tunnel_sock_release(sock4->sock);
		kfree(sock4);
	}

#if IS_ENABLED(CONFIG_IPV6)
	if (__vxlan_sock_release_prep(sock6)) {
		udp_tunnel_sock_release(sock6->sock);
		kfree(sock6);
	}
#endif
}
1417
/* Update multicast group membership when first VNI on
 * multicast address is brought up.
 *
 * Joins the device's default multicast remote on the appropriate
 * v4/v6 tunnel socket.  Returns 0 on success or a negative errno
 * (-EINVAL if no family branch applied).
 */
static int vxlan_igmp_join(struct vxlan_dev *vxlan)
{
	struct sock *sk;
	union vxlan_addr *ip = &vxlan->default_dst.remote_ip;
	int ifindex = vxlan->default_dst.remote_ifindex;
	int ret = -EINVAL;

	if (ip->sa.sa_family == AF_INET) {
		struct vxlan_sock *sock4 = rtnl_dereference(vxlan->vn4_sock);
		struct ip_mreqn mreq = {
			.imr_multiaddr.s_addr	= ip->sin.sin_addr.s_addr,
			.imr_ifindex		= ifindex,
		};

		sk = sock4->sock->sk;
		lock_sock(sk);
		ret = ip_mc_join_group(sk, &mreq);
		release_sock(sk);
#if IS_ENABLED(CONFIG_IPV6)
	} else {
		struct vxlan_sock *sock6 = rtnl_dereference(vxlan->vn6_sock);

		sk = sock6->sock->sk;
		lock_sock(sk);
		ret = ipv6_stub->ipv6_sock_mc_join(sk, ifindex,
						   &ip->sin6.sin6_addr);
		release_sock(sk);
#endif
	}

	return ret;
}
1453
/* Inverse of vxlan_igmp_join when last VNI is brought down.
 *
 * Leaves the default multicast group on the appropriate v4/v6 tunnel
 * socket.  Returns 0 on success or a negative errno (-EINVAL if no
 * family branch applied).
 */
static int vxlan_igmp_leave(struct vxlan_dev *vxlan)
{
	struct sock *sk;
	union vxlan_addr *ip = &vxlan->default_dst.remote_ip;
	int ifindex = vxlan->default_dst.remote_ifindex;
	int ret = -EINVAL;

	if (ip->sa.sa_family == AF_INET) {
		struct vxlan_sock *sock4 = rtnl_dereference(vxlan->vn4_sock);
		struct ip_mreqn mreq = {
			.imr_multiaddr.s_addr	= ip->sin.sin_addr.s_addr,
			.imr_ifindex		= ifindex,
		};

		sk = sock4->sock->sk;
		lock_sock(sk);
		ret = ip_mc_leave_group(sk, &mreq);
		release_sock(sk);
#if IS_ENABLED(CONFIG_IPV6)
	} else {
		struct vxlan_sock *sock6 = rtnl_dereference(vxlan->vn6_sock);

		sk = sock6->sock->sk;
		lock_sock(sk);
		ret = ipv6_stub->ipv6_sock_mc_drop(sk, ifindex,
						   &ip->sin6.sin6_addr);
		release_sock(sk);
#endif
	}

	return ret;
}
1487
/* Handle the remote checksum offload (RCO) extension on receive.
 *
 * When the RCO flag is present in the header and the skb has not already
 * been processed for remcsum, the checksum is fixed up in place at the
 * start/offset encoded in the VNI field.  The consumed RCO bits are
 * cleared from @unparsed in all cases.
 *
 * Returns false only when the packet is too short to hold the encoded
 * checksum region (caller drops it), true otherwise.
 */
static bool vxlan_remcsum(struct vxlanhdr *unparsed,
			  struct sk_buff *skb, u32 vxflags)
{
	size_t start, offset;

	if (!(unparsed->vx_flags & VXLAN_HF_RCO) || skb->remcsum_offload)
		goto out;

	start = vxlan_rco_start(unparsed->vx_vni);
	offset = start + vxlan_rco_offset(unparsed->vx_vni);

	if (!pskb_may_pull(skb, offset + sizeof(u16)))
		return false;

	skb_remcsum_process(skb, (void *)(vxlan_hdr(skb) + 1), start, offset,
			    !!(vxflags & VXLAN_F_REMCSUM_NOPARTIAL));
out:
	unparsed->vx_flags &= ~VXLAN_HF_RCO;
	unparsed->vx_vni &= VXLAN_VNI_MASK;
	return true;
}
1509
/* Parse the Group Based Policy (GBP) extension header on receive.
 *
 * Extracts the policy id and flags into @md, marks any tunnel metadata
 * dst with TUNNEL_VXLAN_OPT, and clears the consumed GBP bits from
 * @unparsed so leftover-flag checking in the caller still works.
 */
static void vxlan_parse_gbp_hdr(struct vxlanhdr *unparsed,
				struct sk_buff *skb, u32 vxflags,
				struct vxlan_metadata *md)
{
	struct vxlanhdr_gbp *gbp = (struct vxlanhdr_gbp *)unparsed;
	struct metadata_dst *tun_dst;

	if (!(unparsed->vx_flags & VXLAN_HF_GBP))
		goto out;

	md->gbp = ntohs(gbp->policy_id);

	tun_dst = (struct metadata_dst *)skb_dst(skb);
	if (tun_dst) {
		tun_dst->u.tun_info.key.tun_flags |= TUNNEL_VXLAN_OPT;
		tun_dst->u.tun_info.options_len = sizeof(*md);
	}
	if (gbp->dont_learn)
		md->gbp |= VXLAN_GBP_DONT_LEARN;

	if (gbp->policy_applied)
		md->gbp |= VXLAN_GBP_POLICY_APPLIED;

	/* In flow-based mode, GBP is carried in dst_metadata */
	if (!(vxflags & VXLAN_F_COLLECT_METADATA))
		skb->mark = md->gbp;
out:
	unparsed->vx_flags &= ~VXLAN_GBP_USED_BITS;
}
1539
/* Parse the VXLAN-GPE extension header on receive.
 *
 * Validates the GPE fields, translates the next-protocol value into an
 * ethertype in *@protocol, and clears the consumed GPE bits from
 * @unparsed.  Returns false when the packet must be dropped.
 */
static bool vxlan_parse_gpe_hdr(struct vxlanhdr *unparsed,
				__be16 *protocol,
				struct sk_buff *skb, u32 vxflags)
{
	struct vxlanhdr_gpe *gpe = (struct vxlanhdr_gpe *)unparsed;

	/* Need to have Next Protocol set for interfaces in GPE mode. */
	if (!gpe->np_applied)
		return false;
	/* "The initial version is 0. If a receiver does not support the
	 * version indicated it MUST drop the packet."
	 */
	if (gpe->version != 0)
		return false;
	/* "When the O bit is set to 1, the packet is an OAM packet and OAM
	 * processing MUST occur." However, we don't implement OAM
	 * processing, thus drop the packet.
	 */
	if (gpe->oam_flag)
		return false;

	*protocol = tun_p_to_eth_p(gpe->next_protocol);
	if (!*protocol)
		return false;

	unparsed->vx_flags &= ~VXLAN_GPE_USED_BITS;
	return true;
}
1568
/* Set up the inner Ethernet header of a decapsulated packet and, when
 * learning is enabled, snoop the source MAC/endpoint mapping.
 *
 * Returns false when the packet must be dropped: it is a loop of our own
 * traffic (source MAC equals the device's), or vxlan_snoop() flagged it
 * as bogus.
 */
static bool vxlan_set_mac(struct vxlan_dev *vxlan,
			  struct vxlan_sock *vs,
			  struct sk_buff *skb, __be32 vni)
{
	union vxlan_addr saddr;
	u32 ifindex = skb->dev->ifindex;

	skb_reset_mac_header(skb);
	skb->protocol = eth_type_trans(skb, vxlan->dev);
	skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);

	/* Ignore packet loops (and multicast echo) */
	if (ether_addr_equal(eth_hdr(skb)->h_source, vxlan->dev->dev_addr))
		return false;

	/* Get address from the outer IP header */
	if (vxlan_get_sk_family(vs) == AF_INET) {
		saddr.sin.sin_addr.s_addr = ip_hdr(skb)->saddr;
		saddr.sa.sa_family = AF_INET;
#if IS_ENABLED(CONFIG_IPV6)
	} else {
		saddr.sin6.sin6_addr = ipv6_hdr(skb)->saddr;
		saddr.sa.sa_family = AF_INET6;
#endif
	}

	if ((vxlan->cfg.flags & VXLAN_F_LEARN) &&
	    vxlan_snoop(skb->dev, &saddr, eth_hdr(skb)->h_source, ifindex, vni))
		return false;

	return true;
}
1601
1602 static bool vxlan_ecn_decapsulate(struct vxlan_sock *vs, void *oiph,
1603                                   struct sk_buff *skb)
1604 {
1605         int err = 0;
1606
1607         if (vxlan_get_sk_family(vs) == AF_INET)
1608                 err = IP_ECN_decapsulate(oiph, skb);
1609 #if IS_ENABLED(CONFIG_IPV6)
1610         else
1611                 err = IP6_ECN_decapsulate(oiph, skb);
1612 #endif
1613
1614         if (unlikely(err) && log_ecn_error) {
1615                 if (vxlan_get_sk_family(vs) == AF_INET)
1616                         net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
1617                                              &((struct iphdr *)oiph)->saddr,
1618                                              ((struct iphdr *)oiph)->tos);
1619                 else
1620                         net_info_ratelimited("non-ECT from %pI6\n",
1621                                              &((struct ipv6hdr *)oiph)->saddr);
1622         }
1623         return err <= 1;
1624 }
1625
1626 /* Callback from net/ipv4/udp.c to receive packets */
1627 static int vxlan_rcv(struct sock *sk, struct sk_buff *skb)
1628 {
1629         struct pcpu_sw_netstats *stats;
1630         struct vxlan_dev *vxlan;
1631         struct vxlan_sock *vs;
1632         struct vxlanhdr unparsed;
1633         struct vxlan_metadata _md;
1634         struct vxlan_metadata *md = &_md;
1635         __be16 protocol = htons(ETH_P_TEB);
1636         bool raw_proto = false;
1637         void *oiph;
1638         __be32 vni = 0;
1639
1640         /* Need UDP and VXLAN header to be present */
1641         if (!pskb_may_pull(skb, VXLAN_HLEN))
1642                 goto drop;
1643
1644         unparsed = *vxlan_hdr(skb);
1645         /* VNI flag always required to be set */
1646         if (!(unparsed.vx_flags & VXLAN_HF_VNI)) {
1647                 netdev_dbg(skb->dev, "invalid vxlan flags=%#x vni=%#x\n",
1648                            ntohl(vxlan_hdr(skb)->vx_flags),
1649                            ntohl(vxlan_hdr(skb)->vx_vni));
1650                 /* Return non vxlan pkt */
1651                 goto drop;
1652         }
1653         unparsed.vx_flags &= ~VXLAN_HF_VNI;
1654         unparsed.vx_vni &= ~VXLAN_VNI_MASK;
1655
1656         vs = rcu_dereference_sk_user_data(sk);
1657         if (!vs)
1658                 goto drop;
1659
1660         vni = vxlan_vni(vxlan_hdr(skb)->vx_vni);
1661
1662         vxlan = vxlan_vs_find_vni(vs, skb->dev->ifindex, vni);
1663         if (!vxlan)
1664                 goto drop;
1665
1666         /* For backwards compatibility, only allow reserved fields to be
1667          * used by VXLAN extensions if explicitly requested.
1668          */
1669         if (vs->flags & VXLAN_F_GPE) {
1670                 if (!vxlan_parse_gpe_hdr(&unparsed, &protocol, skb, vs->flags))
1671                         goto drop;
1672                 raw_proto = true;
1673         }
1674
1675         if (__iptunnel_pull_header(skb, VXLAN_HLEN, protocol, raw_proto,
1676                                    !net_eq(vxlan->net, dev_net(vxlan->dev))))
1677                         goto drop;
1678
1679         if (vxlan_collect_metadata(vs)) {
1680                 struct metadata_dst *tun_dst;
1681
1682                 tun_dst = udp_tun_rx_dst(skb, vxlan_get_sk_family(vs), TUNNEL_KEY,
1683                                          key32_to_tunnel_id(vni), sizeof(*md));
1684
1685                 if (!tun_dst)
1686                         goto drop;
1687
1688                 md = ip_tunnel_info_opts(&tun_dst->u.tun_info);
1689
1690                 skb_dst_set(skb, (struct dst_entry *)tun_dst);
1691         } else {
1692                 memset(md, 0, sizeof(*md));
1693         }
1694
1695         if (vs->flags & VXLAN_F_REMCSUM_RX)
1696                 if (!vxlan_remcsum(&unparsed, skb, vs->flags))
1697                         goto drop;
1698         if (vs->flags & VXLAN_F_GBP)
1699                 vxlan_parse_gbp_hdr(&unparsed, skb, vs->flags, md);
1700         /* Note that GBP and GPE can never be active together. This is
1701          * ensured in vxlan_dev_configure.
1702          */
1703
1704         if (unparsed.vx_flags || unparsed.vx_vni) {
1705                 /* If there are any unprocessed flags remaining treat
1706                  * this as a malformed packet. This behavior diverges from
1707                  * VXLAN RFC (RFC7348) which stipulates that bits in reserved
1708                  * in reserved fields are to be ignored. The approach here
1709                  * maintains compatibility with previous stack code, and also
1710                  * is more robust and provides a little more security in
1711                  * adding extensions to VXLAN.
1712                  */
1713                 goto drop;
1714         }
1715
1716         if (!raw_proto) {
1717                 if (!vxlan_set_mac(vxlan, vs, skb, vni))
1718                         goto drop;
1719         } else {
1720                 skb_reset_mac_header(skb);
1721                 skb->dev = vxlan->dev;
1722                 skb->pkt_type = PACKET_HOST;
1723         }
1724
1725         oiph = skb_network_header(skb);
1726         skb_reset_network_header(skb);
1727
1728         if (!vxlan_ecn_decapsulate(vs, oiph, skb)) {
1729                 ++vxlan->dev->stats.rx_frame_errors;
1730                 ++vxlan->dev->stats.rx_errors;
1731                 goto drop;
1732         }
1733
1734         rcu_read_lock();
1735
1736         if (unlikely(!(vxlan->dev->flags & IFF_UP))) {
1737                 rcu_read_unlock();
1738                 atomic_long_inc(&vxlan->dev->rx_dropped);
1739                 goto drop;
1740         }
1741
1742         stats = this_cpu_ptr(vxlan->dev->tstats);
1743         u64_stats_update_begin(&stats->syncp);
1744         stats->rx_packets++;
1745         stats->rx_bytes += skb->len;
1746         u64_stats_update_end(&stats->syncp);
1747
1748         gro_cells_receive(&vxlan->gro_cells, skb);
1749
1750         rcu_read_unlock();
1751
1752         return 0;
1753
1754 drop:
1755         /* Consume bad packet */
1756         kfree_skb(skb);
1757         return 0;
1758 }
1759
1760 /* Callback from net/ipv{4,6}/udp.c to check that we have a VNI for errors */
1761 static int vxlan_err_lookup(struct sock *sk, struct sk_buff *skb)
1762 {
1763         struct vxlan_dev *vxlan;
1764         struct vxlan_sock *vs;
1765         struct vxlanhdr *hdr;
1766         __be32 vni;
1767
1768         if (skb->len < VXLAN_HLEN)
1769                 return -EINVAL;
1770
1771         hdr = vxlan_hdr(skb);
1772
1773         if (!(hdr->vx_flags & VXLAN_HF_VNI))
1774                 return -EINVAL;
1775
1776         vs = rcu_dereference_sk_user_data(sk);
1777         if (!vs)
1778                 return -ENOENT;
1779
1780         vni = vxlan_vni(hdr->vx_vni);
1781         vxlan = vxlan_vs_find_vni(vs, skb->dev->ifindex, vni);
1782         if (!vxlan)
1783                 return -ENOENT;
1784
1785         return 0;
1786 }
1787
/* Respond locally to ARP requests for addresses known in the FDB,
 * suppressing flooding of the request over the tunnel (ARP proxy).
 * Always consumes the skb and returns NETDEV_TX_OK.
 */
static int arp_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct arphdr *parp;
	u8 *arpptr, *sha;
	__be32 sip, tip;
	struct neighbour *n;

	if (dev->flags & IFF_NOARP)
		goto out;

	/* Make sure the full ARP header is in the linear area. */
	if (!pskb_may_pull(skb, arp_hdr_len(dev))) {
		dev->stats.tx_dropped++;
		goto out;
	}
	parp = arp_hdr(skb);

	/* Only Ethernet/IPv4 ARP requests with matching address sizes
	 * are candidates for proxying; anything else is left alone.
	 */
	if ((parp->ar_hrd != htons(ARPHRD_ETHER) &&
	     parp->ar_hrd != htons(ARPHRD_IEEE802)) ||
	    parp->ar_pro != htons(ETH_P_IP) ||
	    parp->ar_op != htons(ARPOP_REQUEST) ||
	    parp->ar_hln != dev->addr_len ||
	    parp->ar_pln != 4)
		goto out;
	/* Walk the variable-size ARP payload: sha, sip, tha, tip. */
	arpptr = (u8 *)parp + sizeof(struct arphdr);
	sha = arpptr;
	arpptr += dev->addr_len;	/* sha */
	memcpy(&sip, arpptr, sizeof(sip));
	arpptr += sizeof(sip);
	arpptr += dev->addr_len;	/* tha */
	memcpy(&tip, arpptr, sizeof(tip));

	if (ipv4_is_loopback(tip) ||
	    ipv4_is_multicast(tip))
		goto out;

	n = neigh_lookup(&arp_tbl, &tip, dev);

	if (n) {
		struct vxlan_fdb *f;
		struct sk_buff	*reply;

		/* Only answer from fully resolved neighbour entries. */
		if (!(n->nud_state & NUD_CONNECTED)) {
			neigh_release(n);
			goto out;
		}

		f = vxlan_find_mac(vxlan, n->ha, vni);
		if (f && vxlan_addr_any(&(first_remote_rcu(f)->remote_ip))) {
			/* bridge-local neighbor */
			neigh_release(n);
			goto out;
		}

		/* Build the proxy ARP reply addressed back to the sender. */
		reply = arp_create(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha,
				n->ha, sha);

		neigh_release(n);

		if (reply == NULL)
			goto out;

		skb_reset_mac_header(reply);
		__skb_pull(reply, skb_network_offset(reply));
		reply->ip_summed = CHECKSUM_UNNECESSARY;
		reply->pkt_type = PACKET_HOST;

		/* Inject the reply into the local stack. */
		if (netif_rx_ni(reply) == NET_RX_DROP)
			dev->stats.rx_dropped++;
	} else if (vxlan->cfg.flags & VXLAN_F_L3MISS) {
		/* Unknown target: notify userspace (l3miss) so it can
		 * populate the neighbour table.
		 */
		union vxlan_addr ipa = {
			.sin.sin_addr.s_addr = tip,
			.sin.sin_family = AF_INET,
		};

		vxlan_ip_miss(dev, &ipa);
	}
out:
	/* The request is always consumed, proxied or not. */
	consume_skb(skb);
	return NETDEV_TX_OK;
}
1869
1870 #if IS_ENABLED(CONFIG_IPV6)
1871 static struct sk_buff *vxlan_na_create(struct sk_buff *request,
1872         struct neighbour *n, bool isrouter)
1873 {
1874         struct net_device *dev = request->dev;
1875         struct sk_buff *reply;
1876         struct nd_msg *ns, *na;
1877         struct ipv6hdr *pip6;
1878         u8 *daddr;
1879         int na_olen = 8; /* opt hdr + ETH_ALEN for target */
1880         int ns_olen;
1881         int i, len;
1882
1883         if (dev == NULL || !pskb_may_pull(request, request->len))
1884                 return NULL;
1885
1886         len = LL_RESERVED_SPACE(dev) + sizeof(struct ipv6hdr) +
1887                 sizeof(*na) + na_olen + dev->needed_tailroom;
1888         reply = alloc_skb(len, GFP_ATOMIC);
1889         if (reply == NULL)
1890                 return NULL;
1891
1892         reply->protocol = htons(ETH_P_IPV6);
1893         reply->dev = dev;
1894         skb_reserve(reply, LL_RESERVED_SPACE(request->dev));
1895         skb_push(reply, sizeof(struct ethhdr));
1896         skb_reset_mac_header(reply);
1897
1898         ns = (struct nd_msg *)(ipv6_hdr(request) + 1);
1899
1900         daddr = eth_hdr(request)->h_source;
1901         ns_olen = request->len - skb_network_offset(request) -
1902                 sizeof(struct ipv6hdr) - sizeof(*ns);
1903         for (i = 0; i < ns_olen-1; i += (ns->opt[i+1]<<3)) {
1904                 if (ns->opt[i] == ND_OPT_SOURCE_LL_ADDR) {
1905                         daddr = ns->opt + i + sizeof(struct nd_opt_hdr);
1906                         break;
1907                 }
1908         }
1909
1910         /* Ethernet header */
1911         ether_addr_copy(eth_hdr(reply)->h_dest, daddr);
1912         ether_addr_copy(eth_hdr(reply)->h_source, n->ha);
1913         eth_hdr(reply)->h_proto = htons(ETH_P_IPV6);
1914         reply->protocol = htons(ETH_P_IPV6);
1915
1916         skb_pull(reply, sizeof(struct ethhdr));
1917         skb_reset_network_header(reply);
1918         skb_put(reply, sizeof(struct ipv6hdr));
1919
1920         /* IPv6 header */
1921
1922         pip6 = ipv6_hdr(reply);
1923         memset(pip6, 0, sizeof(struct ipv6hdr));
1924         pip6->version = 6;
1925         pip6->priority = ipv6_hdr(request)->priority;
1926         pip6->nexthdr = IPPROTO_ICMPV6;
1927         pip6->hop_limit = 255;
1928         pip6->daddr = ipv6_hdr(request)->saddr;
1929         pip6->saddr = *(struct in6_addr *)n->primary_key;
1930
1931         skb_pull(reply, sizeof(struct ipv6hdr));
1932         skb_reset_transport_header(reply);
1933
1934         /* Neighbor Advertisement */
1935         na = skb_put_zero(reply, sizeof(*na) + na_olen);
1936         na->icmph.icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT;
1937         na->icmph.icmp6_router = isrouter;
1938         na->icmph.icmp6_override = 1;
1939         na->icmph.icmp6_solicited = 1;
1940         na->target = ns->target;
1941         ether_addr_copy(&na->opt[2], n->ha);
1942         na->opt[0] = ND_OPT_TARGET_LL_ADDR;
1943         na->opt[1] = na_olen >> 3;
1944
1945         na->icmph.icmp6_cksum = csum_ipv6_magic(&pip6->saddr,
1946                 &pip6->daddr, sizeof(*na)+na_olen, IPPROTO_ICMPV6,
1947                 csum_partial(na, sizeof(*na)+na_olen, 0));
1948
1949         pip6->payload_len = htons(sizeof(*na)+na_olen);
1950
1951         skb_push(reply, sizeof(struct ipv6hdr));
1952
1953         reply->ip_summed = CHECKSUM_UNNECESSARY;
1954
1955         return reply;
1956 }
1957
/* IPv6 counterpart of arp_reduce(): answer Neighbour Solicitations
 * locally (NDP proxy) instead of flooding them over the tunnel.
 * Always consumes the skb and returns NETDEV_TX_OK.
 */
static int neigh_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	const struct in6_addr *daddr;
	const struct ipv6hdr *iphdr;
	struct inet6_dev *in6_dev;
	struct neighbour *n;
	struct nd_msg *msg;

	in6_dev = __in6_dev_get(dev);
	if (!in6_dev)
		goto out;

	iphdr = ipv6_hdr(skb);
	daddr = &iphdr->daddr;
	/* The NS message immediately follows the IPv6 header. */
	msg = (struct nd_msg *)(iphdr + 1);

	if (ipv6_addr_loopback(daddr) ||
	    ipv6_addr_is_multicast(&msg->target))
		goto out;

	n = neigh_lookup(ipv6_stub->nd_tbl, &msg->target, dev);

	if (n) {
		struct vxlan_fdb *f;
		struct sk_buff *reply;

		/* Only answer from fully resolved neighbour entries. */
		if (!(n->nud_state & NUD_CONNECTED)) {
			neigh_release(n);
			goto out;
		}

		f = vxlan_find_mac(vxlan, n->ha, vni);
		if (f && vxlan_addr_any(&(first_remote_rcu(f)->remote_ip))) {
			/* bridge-local neighbor */
			neigh_release(n);
			goto out;
		}

		/* Advertise as router when the FDB entry says so. */
		reply = vxlan_na_create(skb, n,
					!!(f ? f->flags & NTF_ROUTER : 0));

		neigh_release(n);

		if (reply == NULL)
			goto out;

		if (netif_rx_ni(reply) == NET_RX_DROP)
			dev->stats.rx_dropped++;

	} else if (vxlan->cfg.flags & VXLAN_F_L3MISS) {
		/* Unknown target: notify userspace (l3miss). */
		union vxlan_addr ipa = {
			.sin6.sin6_addr = msg->target,
			.sin6.sin6_family = AF_INET6,
		};

		vxlan_ip_miss(dev, &ipa);
	}

out:
	/* The solicitation is always consumed, proxied or not. */
	consume_skb(skb);
	return NETDEV_TX_OK;
}
2021 #endif
2022
2023 static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb)
2024 {
2025         struct vxlan_dev *vxlan = netdev_priv(dev);
2026         struct neighbour *n;
2027
2028         if (is_multicast_ether_addr(eth_hdr(skb)->h_dest))
2029                 return false;
2030
2031         n = NULL;
2032         switch (ntohs(eth_hdr(skb)->h_proto)) {
2033         case ETH_P_IP:
2034         {
2035                 struct iphdr *pip;
2036
2037                 if (!pskb_may_pull(skb, sizeof(struct iphdr)))
2038                         return false;
2039                 pip = ip_hdr(skb);
2040                 n = neigh_lookup(&arp_tbl, &pip->daddr, dev);
2041                 if (!n && (vxlan->cfg.flags & VXLAN_F_L3MISS)) {
2042                         union vxlan_addr ipa = {
2043                                 .sin.sin_addr.s_addr = pip->daddr,
2044                                 .sin.sin_family = AF_INET,
2045                         };
2046
2047                         vxlan_ip_miss(dev, &ipa);
2048                         return false;
2049                 }
2050
2051                 break;
2052         }
2053 #if IS_ENABLED(CONFIG_IPV6)
2054         case ETH_P_IPV6:
2055         {
2056                 struct ipv6hdr *pip6;
2057
2058                 if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
2059                         return false;
2060                 pip6 = ipv6_hdr(skb);
2061                 n = neigh_lookup(ipv6_stub->nd_tbl, &pip6->daddr, dev);
2062                 if (!n && (vxlan->cfg.flags & VXLAN_F_L3MISS)) {
2063                         union vxlan_addr ipa = {
2064                                 .sin6.sin6_addr = pip6->daddr,
2065                                 .sin6.sin6_family = AF_INET6,
2066                         };
2067
2068                         vxlan_ip_miss(dev, &ipa);
2069                         return false;
2070                 }
2071
2072                 break;
2073         }
2074 #endif
2075         default:
2076                 return false;
2077         }
2078
2079         if (n) {
2080                 bool diff;
2081
2082                 diff = !ether_addr_equal(eth_hdr(skb)->h_dest, n->ha);
2083                 if (diff) {
2084                         memcpy(eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
2085                                 dev->addr_len);
2086                         memcpy(eth_hdr(skb)->h_dest, n->ha, dev->addr_len);
2087                 }
2088                 neigh_release(n);
2089                 return diff;
2090         }
2091
2092         return false;
2093 }
2094
2095 static void vxlan_build_gbp_hdr(struct vxlanhdr *vxh, u32 vxflags,
2096                                 struct vxlan_metadata *md)
2097 {
2098         struct vxlanhdr_gbp *gbp;
2099
2100         if (!md->gbp)
2101                 return;
2102
2103         gbp = (struct vxlanhdr_gbp *)vxh;
2104         vxh->vx_flags |= VXLAN_HF_GBP;
2105
2106         if (md->gbp & VXLAN_GBP_DONT_LEARN)
2107                 gbp->dont_learn = 1;
2108
2109         if (md->gbp & VXLAN_GBP_POLICY_APPLIED)
2110                 gbp->policy_applied = 1;
2111
2112         gbp->policy_id = htons(md->gbp & VXLAN_GBP_ID_MASK);
2113 }
2114
2115 static int vxlan_build_gpe_hdr(struct vxlanhdr *vxh, u32 vxflags,
2116                                __be16 protocol)
2117 {
2118         struct vxlanhdr_gpe *gpe = (struct vxlanhdr_gpe *)vxh;
2119
2120         gpe->np_applied = 1;
2121         gpe->next_protocol = tun_p_from_eth_p(protocol);
2122         if (!gpe->next_protocol)
2123                 return -EPFNOSUPPORT;
2124         return 0;
2125 }
2126
/* Push the VXLAN (and optional GBP/GPE) header onto @skb and set up
 * tunnel offload state.  @vni is the VNI, @vxflags the VXLAN_F_* flags
 * of the sending socket, @iphdr_len the size of the outer IP header to
 * come.  Returns 0 or a negative errno; the caller owns the skb.
 */
static int vxlan_build_skb(struct sk_buff *skb, struct dst_entry *dst,
			   int iphdr_len, __be32 vni,
			   struct vxlan_metadata *md, u32 vxflags,
			   bool udp_sum)
{
	struct vxlanhdr *vxh;
	int min_headroom;
	int err;
	int type = udp_sum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL;
	__be16 inner_protocol = htons(ETH_P_TEB);

	/* Remote checksum offload is only usable when the checksum start
	 * is in range and aligned, and covers a UDP or TCP checksum.
	 */
	if ((vxflags & VXLAN_F_REMCSUM_TX) &&
	    skb->ip_summed == CHECKSUM_PARTIAL) {
		int csum_start = skb_checksum_start_offset(skb);

		if (csum_start <= VXLAN_MAX_REMCSUM_START &&
		    !(csum_start & VXLAN_RCO_SHIFT_MASK) &&
		    (skb->csum_offset == offsetof(struct udphdr, check) ||
		     skb->csum_offset == offsetof(struct tcphdr, check)))
			type |= SKB_GSO_TUNNEL_REMCSUM;
	}

	min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len
			+ VXLAN_HLEN + iphdr_len;

	/* Need space for new headers (invalidates iph ptr) */
	err = skb_cow_head(skb, min_headroom);
	if (unlikely(err))
		return err;

	err = iptunnel_handle_offloads(skb, type);
	if (err)
		return err;

	vxh = __skb_push(skb, sizeof(*vxh));
	vxh->vx_flags = VXLAN_HF_VNI;
	vxh->vx_vni = vxlan_vni_field(vni);

	if (type & SKB_GSO_TUNNEL_REMCSUM) {
		unsigned int start;

		/* Encode the checksum start offset into the VNI field. */
		start = skb_checksum_start_offset(skb) - sizeof(struct vxlanhdr);
		vxh->vx_vni |= vxlan_compute_rco(start, skb->csum_offset);
		vxh->vx_flags |= VXLAN_HF_RCO;

		if (!skb_is_gso(skb)) {
			skb->ip_summed = CHECKSUM_NONE;
			skb->encapsulation = 0;
		}
	}

	/* GBP and GPE are mutually exclusive (enforced at configure time). */
	if (vxflags & VXLAN_F_GBP)
		vxlan_build_gbp_hdr(vxh, vxflags, md);
	if (vxflags & VXLAN_F_GPE) {
		err = vxlan_build_gpe_hdr(vxh, vxflags, skb->protocol);
		if (err < 0)
			return err;
		/* GPE carries the raw inner protocol, not Ethernet. */
		inner_protocol = skb->protocol;
	}

	skb_set_inner_protocol(skb, inner_protocol);
	return 0;
}
2190
/* Resolve the IPv4 route for an outgoing tunnel packet, consulting
 * @dst_cache first when usable.  On success *saddr is updated with the
 * chosen source address.  Returns the route or an ERR_PTR.
 */
static struct rtable *vxlan_get_route(struct vxlan_dev *vxlan, struct net_device *dev,
				      struct vxlan_sock *sock4,
				      struct sk_buff *skb, int oif, u8 tos,
				      __be32 daddr, __be32 *saddr, __be16 dport, __be16 sport,
				      struct dst_cache *dst_cache,
				      const struct ip_tunnel_info *info)
{
	bool use_cache = ip_tunnel_dst_cache_usable(skb, info);
	struct rtable *rt = NULL;
	struct flowi4 fl4;

	if (!sock4)
		return ERR_PTR(-EIO);

	/* TOS inherited from the inner packet varies per packet, so a
	 * cached route cannot be trusted in that case.
	 */
	if (tos && !info)
		use_cache = false;
	if (use_cache) {
		rt = dst_cache_get_ip4(dst_cache, saddr);
		if (rt)
			return rt;
	}

	memset(&fl4, 0, sizeof(fl4));
	fl4.flowi4_oif = oif;
	fl4.flowi4_tos = RT_TOS(tos);
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_proto = IPPROTO_UDP;
	fl4.daddr = daddr;
	fl4.saddr = *saddr;
	fl4.fl4_dport = dport;
	fl4.fl4_sport = sport;

	rt = ip_route_output_key(vxlan->net, &fl4);
	if (likely(!IS_ERR(rt))) {
		/* A route back through ourselves would loop forever. */
		if (rt->dst.dev == dev) {
			netdev_dbg(dev, "circular route to %pI4\n", &daddr);
			ip_rt_put(rt);
			return ERR_PTR(-ELOOP);
		}

		*saddr = fl4.saddr;
		if (use_cache)
			dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr);
	} else {
		netdev_dbg(dev, "no route to %pI4\n", &daddr);
		return ERR_PTR(-ENETUNREACH);
	}
	return rt;
}
2240
2241 #if IS_ENABLED(CONFIG_IPV6)
/* Resolve the IPv6 route for an outgoing tunnel packet, consulting
 * @dst_cache first when usable.  On success *saddr is updated with the
 * chosen source address.  Returns the dst entry or an ERR_PTR.
 */
static struct dst_entry *vxlan6_get_route(struct vxlan_dev *vxlan,
					  struct net_device *dev,
					  struct vxlan_sock *sock6,
					  struct sk_buff *skb, int oif, u8 tos,
					  __be32 label,
					  const struct in6_addr *daddr,
					  struct in6_addr *saddr,
					  __be16 dport, __be16 sport,
					  struct dst_cache *dst_cache,
					  const struct ip_tunnel_info *info)
{
	bool use_cache = ip_tunnel_dst_cache_usable(skb, info);
	struct dst_entry *ndst;
	struct flowi6 fl6;
	int err;

	if (!sock6)
		return ERR_PTR(-EIO);

	/* Inner-derived TOS varies per packet; skip the cache then. */
	if (tos && !info)
		use_cache = false;
	if (use_cache) {
		ndst = dst_cache_get_ip6(dst_cache, saddr);
		if (ndst)
			return ndst;
	}

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_oif = oif;
	fl6.daddr = *daddr;
	fl6.saddr = *saddr;
	fl6.flowlabel = ip6_make_flowinfo(RT_TOS(tos), label);
	fl6.flowi6_mark = skb->mark;
	fl6.flowi6_proto = IPPROTO_UDP;
	fl6.fl6_dport = dport;
	fl6.fl6_sport = sport;

	err = ipv6_stub->ipv6_dst_lookup(vxlan->net,
					 sock6->sock->sk,
					 &ndst, &fl6);
	if (unlikely(err < 0)) {
		netdev_dbg(dev, "no route to %pI6\n", daddr);
		return ERR_PTR(-ENETUNREACH);
	}

	/* A route back through ourselves would loop forever. */
	if (unlikely(ndst->dev == dev)) {
		netdev_dbg(dev, "circular route to %pI6\n", daddr);
		dst_release(ndst);
		return ERR_PTR(-ELOOP);
	}

	*saddr = fl6.saddr;
	if (use_cache)
		dst_cache_set_ip6(dst_cache, ndst, saddr);
	return ndst;
}
2298 #endif
2299
/* Bypass encapsulation if the destination is local: hand the frame
 * directly to the destination vxlan device as if received from the
 * wire, using a loopback address as the learning source.
 */
static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan,
			       struct vxlan_dev *dst_vxlan, __be32 vni)
{
	struct pcpu_sw_netstats *tx_stats, *rx_stats;
	union vxlan_addr loopback;
	union vxlan_addr *remote_ip = &dst_vxlan->default_dst.remote_ip;
	struct net_device *dev;
	int len = skb->len;

	tx_stats = this_cpu_ptr(src_vxlan->dev->tstats);
	rx_stats = this_cpu_ptr(dst_vxlan->dev->tstats);
	skb->pkt_type = PACKET_HOST;
	skb->encapsulation = 0;
	skb->dev = dst_vxlan->dev;
	__skb_pull(skb, skb_network_offset(skb));

	/* Pick the loopback address of the matching family to use as
	 * the "outer source" for FDB learning below.
	 */
	if (remote_ip->sa.sa_family == AF_INET) {
		loopback.sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
		loopback.sa.sa_family =  AF_INET;
#if IS_ENABLED(CONFIG_IPV6)
	} else {
		loopback.sin6.sin6_addr = in6addr_loopback;
		loopback.sa.sa_family =  AF_INET6;
#endif
	}

	/* Hold RCU so the destination device cannot go away between the
	 * IFF_UP check and delivery.
	 */
	rcu_read_lock();
	dev = skb->dev;
	if (unlikely(!(dev->flags & IFF_UP))) {
		kfree_skb(skb);
		goto drop;
	}

	if (dst_vxlan->cfg.flags & VXLAN_F_LEARN)
		vxlan_snoop(dev, &loopback, eth_hdr(skb)->h_source, 0, vni);

	u64_stats_update_begin(&tx_stats->syncp);
	tx_stats->tx_packets++;
	tx_stats->tx_bytes += len;
	u64_stats_update_end(&tx_stats->syncp);

	/* Count as rx on the destination only if delivery succeeded. */
	if (netif_rx(skb) == NET_RX_SUCCESS) {
		u64_stats_update_begin(&rx_stats->syncp);
		rx_stats->rx_packets++;
		rx_stats->rx_bytes += len;
		u64_stats_update_end(&rx_stats->syncp);
	} else {
drop:
		dev->stats.rx_dropped++;
	}
	rcu_read_unlock();
}
2353
2354 static int encap_bypass_if_local(struct sk_buff *skb, struct net_device *dev,
2355                                  struct vxlan_dev *vxlan,
2356                                  union vxlan_addr *daddr,
2357                                  __be16 dst_port, int dst_ifindex, __be32 vni,
2358                                  struct dst_entry *dst,
2359                                  u32 rt_flags)
2360 {
2361 #if IS_ENABLED(CONFIG_IPV6)
2362         /* IPv6 rt-flags are checked against RTF_LOCAL, but the value of
2363          * RTF_LOCAL is equal to RTCF_LOCAL. So to keep code simple
2364          * we can use RTCF_LOCAL which works for ipv4 and ipv6 route entry.
2365          */
2366         BUILD_BUG_ON(RTCF_LOCAL != RTF_LOCAL);
2367 #endif
2368         /* Bypass encapsulation if the destination is local */
2369         if (rt_flags & RTCF_LOCAL &&
2370             !(rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) {
2371                 struct vxlan_dev *dst_vxlan;
2372
2373                 dst_release(dst);
2374                 dst_vxlan = vxlan_find_vni(vxlan->net, dst_ifindex, vni,
2375                                            daddr->sa.sa_family, dst_port,
2376                                            vxlan->cfg.flags);
2377                 if (!dst_vxlan) {
2378                         dev->stats.tx_errors++;
2379                         kfree_skb(skb);
2380
2381                         return -ENOENT;
2382                 }
2383                 vxlan_encap_bypass(skb, vxlan, dst_vxlan, vni);
2384                 return 1;
2385         }
2386
2387         return 0;
2388 }
2389
2390 static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
2391                            __be32 default_vni, struct vxlan_rdst *rdst,
2392                            bool did_rsc)
2393 {
2394         struct dst_cache *dst_cache;
2395         struct ip_tunnel_info *info;
2396         struct vxlan_dev *vxlan = netdev_priv(dev);
2397         const struct iphdr *old_iph = ip_hdr(skb);
2398         union vxlan_addr *dst;
2399         union vxlan_addr remote_ip, local_ip;
2400         struct vxlan_metadata _md;
2401         struct vxlan_metadata *md = &_md;
2402         __be16 src_port = 0, dst_port;
2403         struct dst_entry *ndst = NULL;
2404         __be32 vni, label;
2405         __u8 tos, ttl;
2406         int ifindex;
2407         int err;
2408         u32 flags = vxlan->cfg.flags;
2409         bool udp_sum = false;
2410         bool xnet = !net_eq(vxlan->net, dev_net(vxlan->dev));
2411
2412         info = skb_tunnel_info(skb);
2413
2414         if (rdst) {
2415                 dst = &rdst->remote_ip;
2416                 if (vxlan_addr_any(dst)) {
2417                         if (did_rsc) {
2418                                 /* short-circuited back to local bridge */
2419                                 vxlan_encap_bypass(skb, vxlan, vxlan, default_vni);
2420                                 return;
2421                         }
2422                         goto drop;
2423                 }
2424
2425                 dst_port = rdst->remote_port ? rdst->remote_port : vxlan->cfg.dst_port;
2426                 vni = (rdst->remote_vni) ? : default_vni;
2427                 ifindex = rdst->remote_ifindex;
2428                 local_ip = vxlan->cfg.saddr;
2429                 dst_cache = &rdst->dst_cache;
2430                 md->gbp = skb->mark;
2431                 if (flags & VXLAN_F_TTL_INHERIT) {
2432                         ttl = ip_tunnel_get_ttl(old_iph, skb);
2433                 } else {
2434                         ttl = vxlan->cfg.ttl;
2435                         if (!ttl && vxlan_addr_multicast(dst))
2436                                 ttl = 1;
2437                 }
2438
2439                 tos = vxlan->cfg.tos;
2440                 if (tos == 1)
2441                         tos = ip_tunnel_get_dsfield(old_iph, skb);
2442
2443                 if (dst->sa.sa_family == AF_INET)
2444                         udp_sum = !(flags & VXLAN_F_UDP_ZERO_CSUM_TX);
2445                 else
2446                         udp_sum = !(flags & VXLAN_F_UDP_ZERO_CSUM6_TX);
2447                 label = vxlan->cfg.label;
2448         } else {
2449                 if (!info) {
2450                         WARN_ONCE(1, "%s: Missing encapsulation instructions\n",
2451                                   dev->name);
2452                         goto drop;
2453                 }
2454                 remote_ip.sa.sa_family = ip_tunnel_info_af(info);
2455                 if (remote_ip.sa.sa_family == AF_INET) {
2456                         remote_ip.sin.sin_addr.s_addr = info->key.u.ipv4.dst;
2457                         local_ip.sin.sin_addr.s_addr = info->key.u.ipv4.src;
2458                 } else {
2459                         remote_ip.sin6.sin6_addr = info->key.u.ipv6.dst;
2460                         local_ip.sin6.sin6_addr = info->key.u.ipv6.src;
2461                 }
2462                 dst = &remote_ip;
2463                 dst_port = info->key.tp_dst ? : vxlan->cfg.dst_port;
2464                 vni = tunnel_id_to_key32(info->key.tun_id);
2465                 ifindex = 0;
2466                 dst_cache = &info->dst_cache;
2467                 if (info->options_len &&
2468                     info->key.tun_flags & TUNNEL_VXLAN_OPT)
2469                         md = ip_tunnel_info_opts(info);
2470                 ttl = info->key.ttl;
2471                 tos = info->key.tos;
2472                 label = info->key.label;
2473                 udp_sum = !!(info->key.tun_flags & TUNNEL_CSUM);
2474         }
2475         src_port = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min,
2476                                      vxlan->cfg.port_max, true);
2477
2478         rcu_read_lock();
2479         if (dst->sa.sa_family == AF_INET) {
2480                 struct vxlan_sock *sock4 = rcu_dereference(vxlan->vn4_sock);
2481                 struct rtable *rt;
2482                 __be16 df = 0;
2483
2484                 if (!ifindex)
2485                         ifindex = sock4->sock->sk->sk_bound_dev_if;
2486
2487                 rt = vxlan_get_route(vxlan, dev, sock4, skb, ifindex, tos,
2488                                      dst->sin.sin_addr.s_addr,
2489                                      &local_ip.sin.sin_addr.s_addr,
2490                                      dst_port, src_port,
2491                                      dst_cache, info);
2492                 if (IS_ERR(rt)) {
2493                         err = PTR_ERR(rt);
2494                         goto tx_error;
2495                 }
2496
2497                 if (!info) {
2498                         /* Bypass encapsulation if the destination is local */
2499                         err = encap_bypass_if_local(skb, dev, vxlan, dst,
2500                                                     dst_port, ifindex, vni,
2501                                                     &rt->dst, rt->rt_flags);
2502                         if (err)
2503                                 goto out_unlock;
2504
2505                         if (vxlan->cfg.df == VXLAN_DF_SET) {
2506                                 df = htons(IP_DF);
2507                         } else if (vxlan->cfg.df == VXLAN_DF_INHERIT) {
2508                                 struct ethhdr *eth = eth_hdr(skb);
2509
2510                                 if (ntohs(eth->h_proto) == ETH_P_IPV6 ||
2511                                     (ntohs(eth->h_proto) == ETH_P_IP &&
2512                                      old_iph->frag_off & htons(IP_DF)))
2513                                         df = htons(IP_DF);
2514                         }
2515                 } else if (info->key.tun_flags & TUNNEL_DONT_FRAGMENT) {
2516                         df = htons(IP_DF);
2517                 }
2518
2519                 ndst = &rt->dst;
2520                 skb_tunnel_check_pmtu(skb, ndst, VXLAN_HEADROOM);
2521
2522                 tos = ip_tunnel_ecn_encap(tos, old_iph, skb);
2523                 ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
2524                 err = vxlan_build_skb(skb, ndst, sizeof(struct iphdr),
2525                                       vni, md, flags, udp_sum);
2526                 if (err < 0)
2527                         goto tx_error;
2528
2529                 udp_tunnel_xmit_skb(rt, sock4->sock->sk, skb, local_ip.sin.sin_addr.s_addr,
2530                                     dst->sin.sin_addr.s_addr, tos, ttl, df,
2531                                     src_port, dst_port, xnet, !udp_sum);
2532 #if IS_ENABLED(CONFIG_IPV6)
2533         } else {
2534                 struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock);
2535
2536                 if (!ifindex)
2537                         ifindex = sock6->sock->sk->sk_bound_dev_if;
2538
2539                 ndst = vxlan6_get_route(vxlan, dev, sock6, skb, ifindex, tos,
2540                                         label, &dst->sin6.sin6_addr,
2541                                         &local_ip.sin6.sin6_addr,
2542                                         dst_port, src_port,
2543                                         dst_cache, info);
2544                 if (IS_ERR(ndst)) {
2545                         err = PTR_ERR(ndst);
2546                         ndst = NULL;
2547                         goto tx_error;
2548                 }
2549
2550                 if (!info) {
2551                         u32 rt6i_flags = ((struct rt6_info *)ndst)->rt6i_flags;
2552
2553                         err = encap_bypass_if_local(skb, dev, vxlan, dst,
2554                                                     dst_port, ifindex, vni,
2555                                                     ndst, rt6i_flags);
2556                         if (err)
2557                                 goto out_unlock;
2558                 }
2559
2560                 skb_tunnel_check_pmtu(skb, ndst, VXLAN6_HEADROOM);
2561
2562                 tos = ip_tunnel_ecn_encap(tos, old_iph, skb);
2563                 ttl = ttl ? : ip6_dst_hoplimit(ndst);
2564                 skb_scrub_packet(skb, xnet);
2565                 err = vxlan_build_skb(skb, ndst, sizeof(struct ipv6hdr),
2566                                       vni, md, flags, udp_sum);
2567                 if (err < 0)
2568                         goto tx_error;
2569
2570                 udp_tunnel6_xmit_skb(ndst, sock6->sock->sk, skb, dev,
2571                                      &local_ip.sin6.sin6_addr,
2572                                      &dst->sin6.sin6_addr, tos, ttl,
2573                                      label, src_port, dst_port, !udp_sum);
2574 #endif
2575         }
2576 out_unlock:
2577         rcu_read_unlock();
2578         return;
2579
2580 drop:
2581         dev->stats.tx_dropped++;
2582         dev_kfree_skb(skb);
2583         return;
2584
2585 tx_error:
2586         rcu_read_unlock();
2587         if (err == -ELOOP)
2588                 dev->stats.collisions++;
2589         else if (err == -ENETUNREACH)
2590                 dev->stats.tx_carrier_errors++;
2591         dst_release(ndst);
2592         dev->stats.tx_errors++;
2593         kfree_skb(skb);
2594 }
2595
/* Transmit local packets over Vxlan
 *
 * Outer IP header inherits ECN and DF from inner header.
 * Outer UDP destination is the VXLAN assigned port.
 *           source port is based on hash of flow
 *
 * Looks up the destination FDB entry (optionally after ARP/ND proxying
 * and route short-circuiting) and hands the skb to vxlan_xmit_one() for
 * each remote. Always returns NETDEV_TX_OK; the skb is consumed.
 */
static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_rdst *rdst, *fdst = NULL;
	const struct ip_tunnel_info *info;
	bool did_rsc = false;
	struct vxlan_fdb *f;
	struct ethhdr *eth;
	__be32 vni = 0;

	info = skb_tunnel_info(skb);

	skb_reset_mac_header(skb);

	if (vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) {
		/* In metadata mode a bridge port may supply the VNI via
		 * tunnel metadata yet still want FDB-based forwarding;
		 * in the plain metadata case the metadata fully describes
		 * the single remote, so transmit (or drop) right here.
		 */
		if (info && info->mode & IP_TUNNEL_INFO_BRIDGE &&
		    info->mode & IP_TUNNEL_INFO_TX) {
			vni = tunnel_id_to_key32(info->key.tun_id);
		} else {
			if (info && info->mode & IP_TUNNEL_INFO_TX)
				vxlan_xmit_one(skb, dev, vni, NULL, false);
			else
				kfree_skb(skb);
			return NETDEV_TX_OK;
		}
	}

	if (vxlan->cfg.flags & VXLAN_F_PROXY) {
		eth = eth_hdr(skb);
		/* Proxy mode: answer ARP requests and IPv6 neighbour
		 * solicitations locally instead of flooding the overlay.
		 */
		if (ntohs(eth->h_proto) == ETH_P_ARP)
			return arp_reduce(dev, skb, vni);
#if IS_ENABLED(CONFIG_IPV6)
		else if (ntohs(eth->h_proto) == ETH_P_IPV6 &&
			 pskb_may_pull(skb, sizeof(struct ipv6hdr) +
					    sizeof(struct nd_msg)) &&
			 ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) {
			struct nd_msg *m = (struct nd_msg *)(ipv6_hdr(skb) + 1);

			if (m->icmph.icmp6_code == 0 &&
			    m->icmph.icmp6_type == NDISC_NEIGHBOUR_SOLICITATION)
				return neigh_reduce(dev, skb, vni);
		}
#endif
	}

	eth = eth_hdr(skb);
	f = vxlan_find_mac(vxlan, eth->h_dest, vni);
	did_rsc = false;

	/* Route short-circuit: traffic addressed to an NTF_ROUTER entry
	 * may be rewritten to the final destination MAC, in which case
	 * the FDB lookup must be redone for the new destination.
	 */
	if (f && (f->flags & NTF_ROUTER) && (vxlan->cfg.flags & VXLAN_F_RSC) &&
	    (ntohs(eth->h_proto) == ETH_P_IP ||
	     ntohs(eth->h_proto) == ETH_P_IPV6)) {
		did_rsc = route_shortcircuit(dev, skb);
		if (did_rsc)
			f = vxlan_find_mac(vxlan, eth->h_dest, vni);
	}

	if (f == NULL) {
		/* No match: fall back to the all-zeros default entry. */
		f = vxlan_find_mac(vxlan, all_zeros_mac, vni);
		if (f == NULL) {
			if ((vxlan->cfg.flags & VXLAN_F_L2MISS) &&
			    !is_multicast_ether_addr(eth->h_dest))
				vxlan_fdb_miss(vxlan, eth->h_dest);

			dev->stats.tx_dropped++;
			kfree_skb(skb);
			return NETDEV_TX_OK;
		}
	}

	/* Clone the skb for every remote except the first; the original
	 * skb is consumed by the first remote (fdst) below, avoiding one
	 * clone in the common single-remote case.
	 */
	list_for_each_entry_rcu(rdst, &f->remotes, list) {
		struct sk_buff *skb1;

		if (!fdst) {
			fdst = rdst;
			continue;
		}
		skb1 = skb_clone(skb, GFP_ATOMIC);
		if (skb1)
			vxlan_xmit_one(skb1, dev, vni, rdst, did_rsc);
	}

	if (fdst)
		vxlan_xmit_one(skb, dev, vni, fdst, did_rsc);
	else
		kfree_skb(skb);
	return NETDEV_TX_OK;
}
2690
/* Walk the forwarding table and purge stale entries.
 * Runs from the deferrable age_timer; re-arms itself for the earliest
 * pending expiry (or the default rescan interval).
 */
static void vxlan_cleanup(struct timer_list *t)
{
	struct vxlan_dev *vxlan = from_timer(vxlan, t, age_timer);
	unsigned long next_timer = jiffies + FDB_AGE_INTERVAL;
	unsigned int h;

	/* Do not re-arm while the device is down; vxlan_open() restarts
	 * the timer when the device comes back up.
	 */
	if (!netif_running(vxlan->dev))
		return;

	for (h = 0; h < FDB_HASH_SIZE; ++h) {
		struct hlist_node *p, *n;

		/* Timer callback runs in softirq context, so a plain
		 * spin_lock suffices (process-context users take
		 * hash_lock with _bh).
		 */
		spin_lock(&vxlan->hash_lock);
		hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) {
			struct vxlan_fdb *f
				= container_of(p, struct vxlan_fdb, hlist);
			unsigned long timeout;

			/* Static entries never age out. */
			if (f->state & (NUD_PERMANENT | NUD_NOARP))
				continue;

			/* Externally learned entries are managed by their
			 * controller, not by the ageing timer.
			 */
			if (f->flags & NTF_EXT_LEARNED)
				continue;

			timeout = f->used + vxlan->cfg.age_interval * HZ;
			if (time_before_eq(timeout, jiffies)) {
				netdev_dbg(vxlan->dev,
					   "garbage collect %pM\n",
					   f->eth_addr);
				f->state = NUD_STALE;
				vxlan_fdb_destroy(vxlan, f, true, true);
			} else if (time_before(timeout, next_timer))
				next_timer = timeout;
		}
		spin_unlock(&vxlan->hash_lock);
	}

	mod_timer(&vxlan->age_timer, next_timer);
}
2731
2732 static void vxlan_vs_del_dev(struct vxlan_dev *vxlan)
2733 {
2734         struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
2735
2736         spin_lock(&vn->sock_lock);
2737         hlist_del_init_rcu(&vxlan->hlist4.hlist);
2738 #if IS_ENABLED(CONFIG_IPV6)
2739         hlist_del_init_rcu(&vxlan->hlist6.hlist);
2740 #endif
2741         spin_unlock(&vn->sock_lock);
2742 }
2743
2744 static void vxlan_vs_add_dev(struct vxlan_sock *vs, struct vxlan_dev *vxlan,
2745                              struct vxlan_dev_node *node)
2746 {
2747         struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
2748         __be32 vni = vxlan->default_dst.remote_vni;
2749
2750         node->vxlan = vxlan;
2751         spin_lock(&vn->sock_lock);
2752         hlist_add_head_rcu(&node->hlist, vni_head(vs, vni));
2753         spin_unlock(&vn->sock_lock);
2754 }
2755
2756 /* Setup stats when device is created */
2757 static int vxlan_init(struct net_device *dev)
2758 {
2759         dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
2760         if (!dev->tstats)
2761                 return -ENOMEM;
2762
2763         return 0;
2764 }
2765
2766 static void vxlan_fdb_delete_default(struct vxlan_dev *vxlan, __be32 vni)
2767 {
2768         struct vxlan_fdb *f;
2769
2770         spin_lock_bh(&vxlan->hash_lock);
2771         f = __vxlan_find_mac(vxlan, all_zeros_mac, vni);
2772         if (f)
2773                 vxlan_fdb_destroy(vxlan, f, true, true);
2774         spin_unlock_bh(&vxlan->hash_lock);
2775 }
2776
/* ndo_uninit: tear down state allocated in vxlan_init()/vxlan_setup(). */
static void vxlan_uninit(struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);

	/* Stop GRO cell processing first, before state it could still
	 * reference is destroyed.
	 */
	gro_cells_destroy(&vxlan->gro_cells);

	/* The all-zeros default entry survives vxlan_flush(); delete it
	 * here at final teardown.
	 */
	vxlan_fdb_delete_default(vxlan, vxlan->cfg.vni);

	free_percpu(dev->tstats);
}
2787
/* Start ageing timer and join group when device is brought up.
 * Returns 0 on success; on failure the socket(s) acquired here are
 * released again before returning the error.
 */
static int vxlan_open(struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	int ret;

	ret = vxlan_sock_add(vxlan);
	if (ret < 0)
		return ret;

	if (vxlan_addr_multicast(&vxlan->default_dst.remote_ip)) {
		ret = vxlan_igmp_join(vxlan);
		/* Another vxlan device may already be a member of this
		 * group; that is not an error.
		 */
		if (ret == -EADDRINUSE)
			ret = 0;
		if (ret) {
			vxlan_sock_release(vxlan);
			return ret;
		}
	}

	/* Only start ageing when an ageing interval is configured. */
	if (vxlan->cfg.age_interval)
		mod_timer(&vxlan->age_timer, jiffies + FDB_AGE_INTERVAL);

	return ret;
}
2813
/* Purge the forwarding table.
 * @do_all: when false, static (PERMANENT/NOARP) entries are preserved.
 */
static void vxlan_flush(struct vxlan_dev *vxlan, bool do_all)
{
	unsigned int h;

	spin_lock_bh(&vxlan->hash_lock);
	for (h = 0; h < FDB_HASH_SIZE; ++h) {
		struct hlist_node *p, *n;
		/* _safe variant: entries are unlinked while iterating. */
		hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) {
			struct vxlan_fdb *f
				= container_of(p, struct vxlan_fdb, hlist);
			if (!do_all && (f->state & (NUD_PERMANENT | NUD_NOARP)))
				continue;
			/* the all_zeros_mac entry is deleted at vxlan_uninit */
			if (!is_zero_ether_addr(f->eth_addr))
				vxlan_fdb_destroy(vxlan, f, true, true);
		}
	}
	spin_unlock_bh(&vxlan->hash_lock);
}
2834
/* Cleanup timer and forwarding table on shutdown */
static int vxlan_stop(struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
	int ret = 0;

	/* Leave the multicast group only if no other vxlan device in
	 * this netns still uses it.
	 */
	if (vxlan_addr_multicast(&vxlan->default_dst.remote_ip) &&
	    !vxlan_group_used(vn, vxlan))
		ret = vxlan_igmp_leave(vxlan);

	del_timer_sync(&vxlan->age_timer);

	/* Drop learned entries, keep static ones (do_all == false). */
	vxlan_flush(vxlan, false);
	vxlan_sock_release(vxlan);

	return ret;
}
2853
/* Stub, nothing needs to be done: the overlay has no RX filtering to
 * program, but ndo_set_rx_mode must still be provided.
 */
static void vxlan_set_multicast_list(struct net_device *dev)
{
}
2858
2859 static int vxlan_change_mtu(struct net_device *dev, int new_mtu)
2860 {
2861         struct vxlan_dev *vxlan = netdev_priv(dev);
2862         struct vxlan_rdst *dst = &vxlan->default_dst;
2863         struct net_device *lowerdev = __dev_get_by_index(vxlan->net,
2864                                                          dst->remote_ifindex);
2865         bool use_ipv6 = !!(vxlan->cfg.flags & VXLAN_F_IPV6);
2866
2867         /* This check is different than dev->max_mtu, because it looks at
2868          * the lowerdev->mtu, rather than the static dev->max_mtu
2869          */
2870         if (lowerdev) {
2871                 int max_mtu = lowerdev->mtu -
2872                               (use_ipv6 ? VXLAN6_HEADROOM : VXLAN_HEADROOM);
2873                 if (new_mtu > max_mtu)
2874                         return -EINVAL;
2875         }
2876
2877         dev->mtu = new_mtu;
2878         return 0;
2879 }
2880
/* ndo_fill_metadata_dst: resolve the route for a metadata-mode skb and
 * record the UDP source/destination ports that transmission would use,
 * so upper layers (e.g. OVS) can observe the complete tunnel key.
 */
static int vxlan_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct ip_tunnel_info *info = skb_tunnel_info(skb);
	__be16 sport, dport;

	/* Same flow-hash-based source port selection as the TX path. */
	sport = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min,
				  vxlan->cfg.port_max, true);
	dport = info->key.tp_dst ? : vxlan->cfg.dst_port;

	if (ip_tunnel_info_af(info) == AF_INET) {
		struct vxlan_sock *sock4 = rcu_dereference(vxlan->vn4_sock);
		struct rtable *rt;

		/* The lookup fills info->key.u.ipv4.src as a side effect;
		 * the route itself is not needed and dropped right away.
		 */
		rt = vxlan_get_route(vxlan, dev, sock4, skb, 0, info->key.tos,
				     info->key.u.ipv4.dst,
				     &info->key.u.ipv4.src, dport, sport,
				     &info->dst_cache, info);
		if (IS_ERR(rt))
			return PTR_ERR(rt);
		ip_rt_put(rt);
	} else {
#if IS_ENABLED(CONFIG_IPV6)
		struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock);
		struct dst_entry *ndst;

		ndst = vxlan6_get_route(vxlan, dev, sock6, skb, 0, info->key.tos,
					info->key.label, &info->key.u.ipv6.dst,
					&info->key.u.ipv6.src, dport, sport,
					&info->dst_cache, info);
		if (IS_ERR(ndst))
			return PTR_ERR(ndst);
		dst_release(ndst);
#else /* !CONFIG_IPV6 */
		return -EPFNOSUPPORT;
#endif
	}
	info->key.tp_src = sport;
	info->key.tp_dst = dport;
	return 0;
}
2922
/* netdev ops for Ethernet-mode (default) VXLAN devices; includes the
 * FDB callbacks that raw (ARPHRD_NONE) mode does not have.
 */
static const struct net_device_ops vxlan_netdev_ether_ops = {
	.ndo_init		= vxlan_init,
	.ndo_uninit		= vxlan_uninit,
	.ndo_open		= vxlan_open,
	.ndo_stop		= vxlan_stop,
	.ndo_start_xmit		= vxlan_xmit,
	.ndo_get_stats64	= ip_tunnel_get_stats64,
	.ndo_set_rx_mode	= vxlan_set_multicast_list,
	.ndo_change_mtu		= vxlan_change_mtu,
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_set_mac_address	= eth_mac_addr,
	.ndo_fdb_add		= vxlan_fdb_add,
	.ndo_fdb_del		= vxlan_fdb_delete,
	.ndo_fdb_dump		= vxlan_fdb_dump,
	.ndo_fdb_get		= vxlan_fdb_get,
	.ndo_fill_metadata_dst	= vxlan_fill_metadata_dst,
	.ndo_change_proto_down	= dev_change_proto_down_generic,
};
2941
/* netdev ops for raw (ARPHRD_NONE / GPE) devices: no link layer, so no
 * MAC-address or FDB callbacks.
 */
static const struct net_device_ops vxlan_netdev_raw_ops = {
	.ndo_init		= vxlan_init,
	.ndo_uninit		= vxlan_uninit,
	.ndo_open		= vxlan_open,
	.ndo_stop		= vxlan_stop,
	.ndo_start_xmit		= vxlan_xmit,
	.ndo_get_stats64	= ip_tunnel_get_stats64,
	.ndo_change_mtu		= vxlan_change_mtu,
	.ndo_fill_metadata_dst	= vxlan_fill_metadata_dst,
};
2952
/* Info for udev, that this is a virtual tunnel endpoint (exposed via
 * the sysfs DEVTYPE attribute).
 */
static struct device_type vxlan_type = {
	.name = "vxlan",
};
2957
2958 /* Calls the ndo_udp_tunnel_add of the caller in order to
2959  * supply the listening VXLAN udp ports. Callers are expected
2960  * to implement the ndo_udp_tunnel_add.
2961  */
2962 static void vxlan_offload_rx_ports(struct net_device *dev, bool push)
2963 {
2964         struct vxlan_sock *vs;
2965         struct net *net = dev_net(dev);
2966         struct vxlan_net *vn = net_generic(net, vxlan_net_id);
2967         unsigned int i;
2968
2969         spin_lock(&vn->sock_lock);
2970         for (i = 0; i < PORT_HASH_SIZE; ++i) {
2971                 hlist_for_each_entry_rcu(vs, &vn->sock_list[i], hlist) {
2972                         unsigned short type;
2973
2974                         if (vs->flags & VXLAN_F_GPE)
2975                                 type = UDP_TUNNEL_TYPE_VXLAN_GPE;
2976                         else
2977                                 type = UDP_TUNNEL_TYPE_VXLAN;
2978
2979                         if (push)
2980                                 udp_tunnel_push_rx_port(dev, vs->sock, type);
2981                         else
2982                                 udp_tunnel_drop_rx_port(dev, vs->sock, type);
2983                 }
2984         }
2985         spin_unlock(&vn->sock_lock);
2986 }
2987
/* Initialize the device structure: common setup shared by Ethernet and
 * raw mode, called from the rtnl_link_ops setup hook.
 */
static void vxlan_setup(struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	unsigned int h;

	eth_hw_addr_random(dev);
	ether_setup(dev);

	dev->needs_free_netdev = true;
	SET_NETDEV_DEVTYPE(dev, &vxlan_type);

	dev->features	|= NETIF_F_LLTX;
	dev->features	|= NETIF_F_SG | NETIF_F_HW_CSUM;
	dev->features	|= NETIF_F_RXCSUM;
	dev->features	|= NETIF_F_GSO_SOFTWARE;

	dev->vlan_features = dev->features;
	dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM;
	dev->hw_features |= NETIF_F_GSO_SOFTWARE;
	/* Keep the dst attached to skbs so xmit can reuse cached routes. */
	netif_keep_dst(dev);
	dev->priv_flags |= IFF_NO_QUEUE;

	/* MTU range: 68 - 65535 */
	dev->min_mtu = ETH_MIN_MTU;
	dev->max_mtu = ETH_MAX_MTU;

	INIT_LIST_HEAD(&vxlan->next);
	spin_lock_init(&vxlan->hash_lock);

	/* Deferrable: FDB ageing does not need to wake an idle CPU. */
	timer_setup(&vxlan->age_timer, vxlan_cleanup, TIMER_DEFERRABLE);

	vxlan->dev = dev;

	gro_cells_init(&vxlan->gro_cells, dev);

	for (h = 0; h < FDB_HASH_SIZE; ++h)
		INIT_HLIST_HEAD(&vxlan->fdb_head[h]);
}
3027
3028 static void vxlan_ether_setup(struct net_device *dev)
3029 {
3030         dev->priv_flags &= ~IFF_TX_SKB_SHARING;
3031         dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
3032         dev->netdev_ops = &vxlan_netdev_ether_ops;
3033 }
3034
3035 static void vxlan_raw_setup(struct net_device *dev)
3036 {
3037         dev->header_ops = NULL;
3038         dev->type = ARPHRD_NONE;
3039         dev->hard_header_len = 0;
3040         dev->addr_len = 0;
3041         dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST;
3042         dev->netdev_ops = &vxlan_netdev_raw_ops;
3043 }
3044
/* Netlink attribute policy for the IFLA_VXLAN_* configuration space. */
static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
	[IFLA_VXLAN_ID]		= { .type = NLA_U32 },
	[IFLA_VXLAN_GROUP]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
	[IFLA_VXLAN_GROUP6]	= { .len = sizeof(struct in6_addr) },
	[IFLA_VXLAN_LINK]	= { .type = NLA_U32 },
	[IFLA_VXLAN_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
	[IFLA_VXLAN_LOCAL6]	= { .len = sizeof(struct in6_addr) },
	[IFLA_VXLAN_TOS]	= { .type = NLA_U8 },
	[IFLA_VXLAN_TTL]	= { .type = NLA_U8 },
	[IFLA_VXLAN_LABEL]	= { .type = NLA_U32 },
	[IFLA_VXLAN_LEARNING]	= { .type = NLA_U8 },
	[IFLA_VXLAN_AGEING]	= { .type = NLA_U32 },
	[IFLA_VXLAN_LIMIT]	= { .type = NLA_U32 },
	[IFLA_VXLAN_PORT_RANGE] = { .len  = sizeof(struct ifla_vxlan_port_range) },
	[IFLA_VXLAN_PROXY]	= { .type = NLA_U8 },
	[IFLA_VXLAN_RSC]	= { .type = NLA_U8 },
	[IFLA_VXLAN_L2MISS]	= { .type = NLA_U8 },
	[IFLA_VXLAN_L3MISS]	= { .type = NLA_U8 },
	[IFLA_VXLAN_COLLECT_METADATA]	= { .type = NLA_U8 },
	[IFLA_VXLAN_PORT]	= { .type = NLA_U16 },
	[IFLA_VXLAN_UDP_CSUM]	= { .type = NLA_U8 },
	[IFLA_VXLAN_UDP_ZERO_CSUM6_TX]	= { .type = NLA_U8 },
	[IFLA_VXLAN_UDP_ZERO_CSUM6_RX]	= { .type = NLA_U8 },
	[IFLA_VXLAN_REMCSUM_TX]	= { .type = NLA_U8 },
	[IFLA_VXLAN_REMCSUM_RX]	= { .type = NLA_U8 },
	[IFLA_VXLAN_GBP]	= { .type = NLA_FLAG, },
	[IFLA_VXLAN_GPE]	= { .type = NLA_FLAG, },
	[IFLA_VXLAN_REMCSUM_NOPARTIAL]	= { .type = NLA_FLAG },
	[IFLA_VXLAN_TTL_INHERIT]	= { .type = NLA_FLAG },
	[IFLA_VXLAN_DF]		= { .type = NLA_U8 },
};
3076
3077 static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[],
3078                           struct netlink_ext_ack *extack)
3079 {
3080         if (tb[IFLA_ADDRESS]) {
3081                 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) {
3082                         NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_ADDRESS],
3083                                             "Provided link layer address is not Ethernet");
3084                         return -EINVAL;
3085                 }
3086
3087                 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) {
3088                         NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_ADDRESS],
3089                                             "Provided Ethernet address is not unicast");
3090                         return -EADDRNOTAVAIL;
3091                 }
3092         }
3093
3094         if (tb[IFLA_MTU]) {
3095                 u32 mtu = nla_get_u32(tb[IFLA_MTU]);
3096
3097                 if (mtu < ETH_MIN_MTU || mtu > ETH_MAX_MTU) {
3098                         NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_MTU],
3099                                             "MTU must be between 68 and 65535");
3100                         return -EINVAL;
3101                 }
3102         }
3103
3104         if (!data) {
3105                 NL_SET_ERR_MSG(extack,
3106                                "Required attributes not provided to perform the operation");
3107                 return -EINVAL;
3108         }
3109
3110         if (data[IFLA_VXLAN_ID]) {
3111                 u32 id = nla_get_u32(data[IFLA_VXLAN_ID]);
3112
3113                 if (id >= VXLAN_N_VID) {
3114                         NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_ID],
3115                                             "VXLAN ID must be lower than 16777216");
3116                         return -ERANGE;
3117                 }
3118         }
3119
3120         if (data[IFLA_VXLAN_PORT_RANGE]) {
3121                 const struct ifla_vxlan_port_range *p
3122                         = nla_data(data[IFLA_VXLAN_PORT_RANGE]);
3123
3124                 if (ntohs(p->high) < ntohs(p->low)) {
3125                         NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_PORT_RANGE],
3126                                             "Invalid source port range");
3127                         return -EINVAL;
3128                 }
3129         }
3130
3131         if (data[IFLA_VXLAN_DF]) {
3132                 enum ifla_vxlan_df df = nla_get_u8(data[IFLA_VXLAN_DF]);
3133
3134                 if (df < 0 || df > VXLAN_DF_MAX) {
3135                         NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_DF],
3136                                             "Invalid DF attribute");
3137                         return -EINVAL;
3138                 }
3139         }
3140
3141         return 0;
3142 }
3143
3144 static void vxlan_get_drvinfo(struct net_device *netdev,
3145                               struct ethtool_drvinfo *drvinfo)
3146 {
3147         strlcpy(drvinfo->version, VXLAN_VERSION, sizeof(drvinfo->version));
3148         strlcpy(drvinfo->driver, "vxlan", sizeof(drvinfo->driver));
3149 }
3150
/* Minimal ethtool support: driver identification and link state only. */
static const struct ethtool_ops vxlan_ethtool_ops = {
	.get_drvinfo	= vxlan_get_drvinfo,
	.get_link	= ethtool_op_get_link,
};
3155
3156 static struct socket *vxlan_create_sock(struct net *net, bool ipv6,
3157                                         __be16 port, u32 flags, int ifindex)
3158 {
3159         struct socket *sock;
3160         struct udp_port_cfg udp_conf;
3161         int err;
3162
3163         memset(&udp_conf, 0, sizeof(udp_conf));
3164
3165         if (ipv6) {
3166                 udp_conf.family = AF_INET6;
3167                 udp_conf.use_udp6_rx_checksums =
3168                     !(flags & VXLAN_F_UDP_ZERO_CSUM6_RX);
3169                 udp_conf.ipv6_v6only = 1;
3170         } else {
3171                 udp_conf.family = AF_INET;
3172         }
3173
3174         udp_conf.local_udp_port = port;
3175         udp_conf.bind_ifindex = ifindex;
3176
3177         /* Open UDP socket */
3178         err = udp_sock_create(net, &udp_conf, &sock);
3179         if (err < 0)
3180                 return ERR_PTR(err);
3181
3182         return sock;
3183 }
3184
/* Create new listen socket if needed: allocate a vxlan_sock, open its
 * UDP socket, publish it on the per-netns port hash and configure it as
 * an encapsulation socket. Returns the vxlan_sock or an ERR_PTR.
 */
static struct vxlan_sock *vxlan_socket_create(struct net *net, bool ipv6,
					      __be16 port, u32 flags,
					      int ifindex)
{
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
	struct vxlan_sock *vs;
	struct socket *sock;
	unsigned int h;
	struct udp_tunnel_sock_cfg tunnel_cfg;

	vs = kzalloc(sizeof(*vs), GFP_KERNEL);
	if (!vs)
		return ERR_PTR(-ENOMEM);

	for (h = 0; h < VNI_HASH_SIZE; ++h)
		INIT_HLIST_HEAD(&vs->vni_list[h]);

	sock = vxlan_create_sock(net, ipv6, port, flags, ifindex);
	if (IS_ERR(sock)) {
		kfree(vs);
		return ERR_CAST(sock);
	}

	vs->sock = sock;
	refcount_set(&vs->refcnt, 1);
	/* Only receive-relevant flags are shared between devices. */
	vs->flags = (flags & VXLAN_F_RCV_FLAGS);

	/* Publish the socket and notify offload-capable drivers of the
	 * new RX port under sock_lock, keeping lookup and offload state
	 * consistent.
	 */
	spin_lock(&vn->sock_lock);
	hlist_add_head_rcu(&vs->hlist, vs_head(net, port));
	udp_tunnel_notify_add_rx_port(sock,
				      (vs->flags & VXLAN_F_GPE) ?
				      UDP_TUNNEL_TYPE_VXLAN_GPE :
				      UDP_TUNNEL_TYPE_VXLAN);
	spin_unlock(&vn->sock_lock);

	/* Mark socket as an encapsulation socket. */
	memset(&tunnel_cfg, 0, sizeof(tunnel_cfg));
	tunnel_cfg.sk_user_data = vs;
	tunnel_cfg.encap_type = 1;
	tunnel_cfg.encap_rcv = vxlan_rcv;
	tunnel_cfg.encap_err_lookup = vxlan_err_lookup;
	tunnel_cfg.encap_destroy = NULL;
	tunnel_cfg.gro_receive = vxlan_gro_receive;
	tunnel_cfg.gro_complete = vxlan_gro_complete;

	setup_udp_tunnel_sock(net, sock, &tunnel_cfg);

	return vs;
}
3235
/* Attach @vxlan to a vxlan_sock of one address family. Unless no_share
 * is set, an existing socket with matching port/flags/l3mdev is reused;
 * otherwise a new one is created. Returns 0 or a negative errno.
 */
static int __vxlan_sock_add(struct vxlan_dev *vxlan, bool ipv6)
{
	struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
	struct vxlan_sock *vs = NULL;
	struct vxlan_dev_node *node;
	int l3mdev_index = 0;

	if (vxlan->cfg.remote_ifindex)
		l3mdev_index = l3mdev_master_upper_ifindex_by_index(
			vxlan->net, vxlan->cfg.remote_ifindex);

	if (!vxlan->cfg.no_share) {
		spin_lock(&vn->sock_lock);
		vs = vxlan_find_sock(vxlan->net, ipv6 ? AF_INET6 : AF_INET,
				     vxlan->cfg.dst_port, vxlan->cfg.flags,
				     l3mdev_index);
		/* A refcount of zero means the socket is concurrently
		 * being torn down; report -EBUSY rather than reuse it.
		 */
		if (vs && !refcount_inc_not_zero(&vs->refcnt)) {
			spin_unlock(&vn->sock_lock);
			return -EBUSY;
		}
		spin_unlock(&vn->sock_lock);
	}
	if (!vs)
		vs = vxlan_socket_create(vxlan->net, ipv6,
					 vxlan->cfg.dst_port, vxlan->cfg.flags,
					 l3mdev_index);
	if (IS_ERR(vs))
		return PTR_ERR(vs);
#if IS_ENABLED(CONFIG_IPV6)
	if (ipv6) {
		rcu_assign_pointer(vxlan->vn6_sock, vs);
		node = &vxlan->hlist6;
	} else
#endif
	{
		rcu_assign_pointer(vxlan->vn4_sock, vs);
		node = &vxlan->hlist4;
	}
	/* Hash the device on the socket's per-VNI list for RX lookup. */
	vxlan_vs_add_dev(vs, vxlan, node);
	return 0;
}
3277
/* Open the socket(s) this device's configuration requires: IPv6 and/or
 * IPv4; in COLLECT_METADATA mode both families are attempted.
 */
static int vxlan_sock_add(struct vxlan_dev *vxlan)
{
	bool metadata = vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA;
	bool ipv6 = vxlan->cfg.flags & VXLAN_F_IPV6 || metadata;
	bool ipv4 = !ipv6 || metadata;
	int ret = 0;

	RCU_INIT_POINTER(vxlan->vn4_sock, NULL);
#if IS_ENABLED(CONFIG_IPV6)
	RCU_INIT_POINTER(vxlan->vn6_sock, NULL);
	if (ipv6) {
		ret = __vxlan_sock_add(vxlan, true);
		/* -EAFNOSUPPORT (IPv6 unavailable) is tolerated so the
		 * IPv4 socket can still be opened in metadata mode; any
		 * other IPv6 failure aborts the IPv4 attempt too.
		 */
		if (ret < 0 && ret != -EAFNOSUPPORT)
			ipv4 = false;
	}
#endif
	if (ipv4)
		ret = __vxlan_sock_add(vxlan, false);
	/* On failure, release whatever was acquired above. */
	if (ret < 0)
		vxlan_sock_release(vxlan);
	return ret;
}
3300
/* Sanity-check and canonicalize @conf before it is applied to a device.
 *
 * Fills in defaults (address family, destination UDP port, FDB ageing
 * interval), validates the GPE/metadata combination and IPv6 address
 * scopes, resolves the lower device, and rejects a configuration that
 * would clash with an existing VXLAN device in the same netns (@old is
 * skipped so a device can be "changed" to its own settings).
 *
 * On success *@lower is set to the resolved lower netdevice (NULL when no
 * remote_ifindex was given) and 0 is returned; otherwise a negative errno
 * is returned with @extack describing the problem.
 *
 * Caller must hold RTNL (__dev_get_by_index() depends on it).
 */
static int vxlan_config_validate(struct net *src_net, struct vxlan_config *conf,
				 struct net_device **lower,
				 struct vxlan_dev *old,
				 struct netlink_ext_ack *extack)
{
	struct vxlan_net *vn = net_generic(src_net, vxlan_net_id);
	struct vxlan_dev *tmp;
	bool use_ipv6 = false;

	if (conf->flags & VXLAN_F_GPE) {
		/* For now, allow GPE only together with
		 * COLLECT_METADATA. This can be relaxed later; in such
		 * case, the other side of the PtP link will have to be
		 * provided.
		 */
		if ((conf->flags & ~VXLAN_F_ALLOWED_GPE) ||
		    !(conf->flags & VXLAN_F_COLLECT_METADATA)) {
			NL_SET_ERR_MSG(extack,
				       "VXLAN GPE does not support this combination of attributes");
			return -EINVAL;
		}
	}

	if (!conf->remote_ip.sa.sa_family && !conf->saddr.sa.sa_family) {
		/* Unless IPv6 is explicitly requested, assume IPv4 */
		conf->remote_ip.sa.sa_family = AF_INET;
		conf->saddr.sa.sa_family = AF_INET;
	} else if (!conf->remote_ip.sa.sa_family) {
		/* One side given: the other inherits its family */
		conf->remote_ip.sa.sa_family = conf->saddr.sa.sa_family;
	} else if (!conf->saddr.sa.sa_family) {
		conf->saddr.sa.sa_family = conf->remote_ip.sa.sa_family;
	}

	if (conf->saddr.sa.sa_family != conf->remote_ip.sa.sa_family) {
		NL_SET_ERR_MSG(extack,
			       "Local and remote address must be from the same family");
		return -EINVAL;
	}

	if (vxlan_addr_multicast(&conf->saddr)) {
		NL_SET_ERR_MSG(extack, "Local address cannot be multicast");
		return -EINVAL;
	}

	if (conf->saddr.sa.sa_family == AF_INET6) {
		if (!IS_ENABLED(CONFIG_IPV6)) {
			NL_SET_ERR_MSG(extack,
				       "IPv6 support not enabled in the kernel");
			return -EPFNOSUPPORT;
		}
		use_ipv6 = true;
		conf->flags |= VXLAN_F_IPV6;

		if (!(conf->flags & VXLAN_F_COLLECT_METADATA)) {
			int local_type =
				ipv6_addr_type(&conf->saddr.sin6.sin6_addr);
			int remote_type =
				ipv6_addr_type(&conf->remote_ip.sin6.sin6_addr);

			/* A link-local local address pairs only with a
			 * link-local (or wildcard) remote, and vice versa.
			 */
			if (local_type & IPV6_ADDR_LINKLOCAL) {
				if (!(remote_type & IPV6_ADDR_LINKLOCAL) &&
				    (remote_type != IPV6_ADDR_ANY)) {
					NL_SET_ERR_MSG(extack,
						       "Invalid combination of local and remote address scopes");
					return -EINVAL;
				}

				conf->flags |= VXLAN_F_IPV6_LINKLOCAL;
			} else {
				if (remote_type ==
				    (IPV6_ADDR_UNICAST | IPV6_ADDR_LINKLOCAL)) {
					NL_SET_ERR_MSG(extack,
						       "Invalid combination of local and remote address scopes");
					return -EINVAL;
				}

				conf->flags &= ~VXLAN_F_IPV6_LINKLOCAL;
			}
		}
	}

	if (conf->label && !use_ipv6) {
		NL_SET_ERR_MSG(extack,
			       "Label attribute only applies to IPv6 VXLAN devices");
		return -EINVAL;
	}

	if (conf->remote_ifindex) {
		struct net_device *lowerdev;

		lowerdev = __dev_get_by_index(src_net, conf->remote_ifindex);
		if (!lowerdev) {
			NL_SET_ERR_MSG(extack,
				       "Invalid local interface, device not found");
			return -ENODEV;
		}

#if IS_ENABLED(CONFIG_IPV6)
		if (use_ipv6) {
			struct inet6_dev *idev = __in6_dev_get(lowerdev);
			if (idev && idev->cnf.disable_ipv6) {
				NL_SET_ERR_MSG(extack,
					       "IPv6 support disabled by administrator");
				return -EPERM;
			}
		}
#endif

		*lower = lowerdev;
	} else {
		/* Without a lower device we can neither join a multicast
		 * group nor disambiguate link-local scopes.
		 */
		if (vxlan_addr_multicast(&conf->remote_ip)) {
			NL_SET_ERR_MSG(extack,
				       "Local interface required for multicast remote destination");

			return -EINVAL;
		}

#if IS_ENABLED(CONFIG_IPV6)
		if (conf->flags & VXLAN_F_IPV6_LINKLOCAL) {
			NL_SET_ERR_MSG(extack,
				       "Local interface required for link-local local/remote addresses");
			return -EINVAL;
		}
#endif

		*lower = NULL;
	}

	if (!conf->dst_port) {
		if (conf->flags & VXLAN_F_GPE)
			conf->dst_port = htons(4790); /* IANA VXLAN-GPE port */
		else
			conf->dst_port = htons(vxlan_port);
	}

	if (!conf->age_interval)
		conf->age_interval = FDB_AGE_DEFAULT;

	/* Refuse a duplicate of an existing device: same VNI, port and
	 * receive-relevant flags (and, for link-local IPv6, the same lower
	 * device).
	 */
	list_for_each_entry(tmp, &vn->vxlan_list, next) {
		if (tmp == old)
			continue;

		if (tmp->cfg.vni != conf->vni)
			continue;
		if (tmp->cfg.dst_port != conf->dst_port)
			continue;
		if ((tmp->cfg.flags & (VXLAN_F_RCV_FLAGS | VXLAN_F_IPV6)) !=
		    (conf->flags & (VXLAN_F_RCV_FLAGS | VXLAN_F_IPV6)))
			continue;

		if ((conf->flags & VXLAN_F_IPV6_LINKLOCAL) &&
		    tmp->cfg.remote_ifindex != conf->remote_ifindex)
			continue;

		NL_SET_ERR_MSG(extack,
			       "A VXLAN device with the specified VNI already exists");
		return -EEXIST;
	}

	return 0;
}
3462
/* Commit an already-validated @conf to @dev.
 *
 * On initial setup (!@changelink) this selects raw vs. ethernet device ops
 * (GPE is an L3, point-to-point style tunnel) and records the owning
 * netns.  It then copies the default destination, inherits GSO limits,
 * derives the MTU upper bound and the required headroom from @lowerdev
 * (when one was resolved), and finally stores @conf as the live device
 * configuration.
 *
 * Must not fail: all checking is done in vxlan_config_validate().
 */
static void vxlan_config_apply(struct net_device *dev,
			       struct vxlan_config *conf,
			       struct net_device *lowerdev,
			       struct net *src_net,
			       bool changelink)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_rdst *dst = &vxlan->default_dst;
	unsigned short needed_headroom = ETH_HLEN;
	bool use_ipv6 = !!(conf->flags & VXLAN_F_IPV6);
	int max_mtu = ETH_MAX_MTU;

	if (!changelink) {
		if (conf->flags & VXLAN_F_GPE)
			vxlan_raw_setup(dev);
		else
			vxlan_ether_setup(dev);

		if (conf->mtu)
			dev->mtu = conf->mtu;

		vxlan->net = src_net;
	}

	dst->remote_vni = conf->vni;

	memcpy(&dst->remote_ip, &conf->remote_ip, sizeof(conf->remote_ip));

	if (lowerdev) {
		dst->remote_ifindex = conf->remote_ifindex;

		dev->gso_max_size = lowerdev->gso_max_size;
		dev->gso_max_segs = lowerdev->gso_max_segs;

		needed_headroom = lowerdev->hard_header_len;

		/* Leave room for the encapsulation headers within the
		 * lower device's MTU, but never go below the legal minimum.
		 */
		max_mtu = lowerdev->mtu - (use_ipv6 ? VXLAN6_HEADROOM :
					   VXLAN_HEADROOM);
		if (max_mtu < ETH_MIN_MTU)
			max_mtu = ETH_MIN_MTU;

		if (!changelink && !conf->mtu)
			dev->mtu = max_mtu;
	}

	if (dev->mtu > max_mtu)
		dev->mtu = max_mtu;

	/* Metadata mode may transmit either family; reserve for the larger
	 * IPv6 encapsulation in that case.
	 */
	if (use_ipv6 || conf->flags & VXLAN_F_COLLECT_METADATA)
		needed_headroom += VXLAN6_HEADROOM;
	else
		needed_headroom += VXLAN_HEADROOM;
	dev->needed_headroom = needed_headroom;

	memcpy(&vxlan->cfg, conf, sizeof(*conf));
}
3519
3520 static int vxlan_dev_configure(struct net *src_net, struct net_device *dev,
3521                                struct vxlan_config *conf, bool changelink,
3522                                struct netlink_ext_ack *extack)
3523 {
3524         struct vxlan_dev *vxlan = netdev_priv(dev);
3525         struct net_device *lowerdev;
3526         int ret;
3527
3528         ret = vxlan_config_validate(src_net, conf, &lowerdev, vxlan, extack);
3529         if (ret)
3530                 return ret;
3531
3532         vxlan_config_apply(dev, conf, lowerdev, src_net, changelink);
3533
3534         return 0;
3535 }
3536
/* Create and register a new VXLAN netdevice from a parsed @conf.
 *
 * Configures the device, pre-creates the all-zeros-MAC default FDB entry
 * when a valid default destination is set, registers the netdevice, and
 * only then sends the RTM_NEWNEIGH notification for that entry.  On any
 * failure after registration the device is unregistered again; the
 * explicit vxlan_fdb_destroy() in the error path exists because
 * unregister_netdevice() would announce the entry's deletion even though
 * its creation was never announced.
 *
 * Caller must hold RTNL.  Returns 0 or a negative errno.
 */
static int __vxlan_dev_create(struct net *net, struct net_device *dev,
			      struct vxlan_config *conf,
			      struct netlink_ext_ack *extack)
{
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_fdb *f = NULL;
	bool unregister = false;
	int err;

	err = vxlan_dev_configure(net, dev, conf, false, extack);
	if (err)
		return err;

	dev->ethtool_ops = &vxlan_ethtool_ops;

	/* create an fdb entry for a valid default destination */
	if (!vxlan_addr_any(&vxlan->default_dst.remote_ip)) {
		err = vxlan_fdb_create(vxlan, all_zeros_mac,
				       &vxlan->default_dst.remote_ip,
				       NUD_REACHABLE | NUD_PERMANENT,
				       vxlan->cfg.dst_port,
				       vxlan->default_dst.remote_vni,
				       vxlan->default_dst.remote_vni,
				       vxlan->default_dst.remote_ifindex,
				       NTF_SELF, &f);
		if (err)
			return err;
	}

	err = register_netdevice(dev);
	if (err)
		goto errout;
	unregister = true;

	err = rtnl_configure_link(dev, NULL);
	if (err)
		goto errout;

	/* notify default fdb entry */
	if (f) {
		err = vxlan_fdb_notify(vxlan, f, first_remote_rtnl(f),
				       RTM_NEWNEIGH, true, extack);
		if (err)
			goto errout;
	}

	list_add(&vxlan->next, &vn->vxlan_list);
	return 0;

errout:
	/* unregister_netdevice() destroys the default FDB entry with deletion
	 * notification. But the addition notification was not sent yet, so
	 * destroy the entry by hand here.
	 */
	if (f)
		vxlan_fdb_destroy(vxlan, f, false, false);
	if (unregister)
		unregister_netdevice(dev);
	return err;
}
3598
3599 /* Set/clear flags based on attribute */
3600 static int vxlan_nl2flag(struct vxlan_config *conf, struct nlattr *tb[],
3601                           int attrtype, unsigned long mask, bool changelink,
3602                           bool changelink_supported,
3603                           struct netlink_ext_ack *extack)
3604 {
3605         unsigned long flags;
3606
3607         if (!tb[attrtype])
3608                 return 0;
3609
3610         if (changelink && !changelink_supported) {
3611                 vxlan_flag_attr_error(attrtype, extack);
3612                 return -EOPNOTSUPP;
3613         }
3614
3615         if (vxlan_policy[attrtype].type == NLA_FLAG)
3616                 flags = conf->flags | mask;
3617         else if (nla_get_u8(tb[attrtype]))
3618                 flags = conf->flags | mask;
3619         else
3620                 flags = conf->flags & ~mask;
3621
3622         conf->flags = flags;
3623
3624         return 0;
3625 }
3626
/* Translate validated netlink attributes (@tb/@data) into a
 * struct vxlan_config.
 *
 * For a changelink operation @conf starts as a copy of the device's
 * current configuration so unspecified attributes keep their old values;
 * attributes that cannot be changed on a live device (VNI, address
 * family of group/local, limit, port range, port, UDP_CSUM, MTU, and the
 * flags handled with changelink_supported == false) are rejected with
 * -EOPNOTSUPP and @extack set.  Returns 0 or a negative errno.
 */
static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[],
			 struct net_device *dev, struct vxlan_config *conf,
			 bool changelink, struct netlink_ext_ack *extack)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	int err = 0;

	memset(conf, 0, sizeof(*conf));

	/* if changelink operation, start with old existing cfg */
	if (changelink)
		memcpy(conf, &vxlan->cfg, sizeof(*conf));

	if (data[IFLA_VXLAN_ID]) {
		__be32 vni = cpu_to_be32(nla_get_u32(data[IFLA_VXLAN_ID]));

		if (changelink && (vni != conf->vni)) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_ID], "Cannot change VNI");
			return -EOPNOTSUPP;
		}
		conf->vni = cpu_to_be32(nla_get_u32(data[IFLA_VXLAN_ID]));
	}

	/* Remote (group) address: the address family may not change on a
	 * live device.
	 */
	if (data[IFLA_VXLAN_GROUP]) {
		if (changelink && (conf->remote_ip.sa.sa_family != AF_INET)) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_GROUP], "New group address family does not match old group");
			return -EOPNOTSUPP;
		}

		conf->remote_ip.sin.sin_addr.s_addr = nla_get_in_addr(data[IFLA_VXLAN_GROUP]);
		conf->remote_ip.sa.sa_family = AF_INET;
	} else if (data[IFLA_VXLAN_GROUP6]) {
		if (!IS_ENABLED(CONFIG_IPV6)) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_GROUP6], "IPv6 support not enabled in the kernel");
			return -EPFNOSUPPORT;
		}

		if (changelink && (conf->remote_ip.sa.sa_family != AF_INET6)) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_GROUP6], "New group address family does not match old group");
			return -EOPNOTSUPP;
		}

		conf->remote_ip.sin6.sin6_addr = nla_get_in6_addr(data[IFLA_VXLAN_GROUP6]);
		conf->remote_ip.sa.sa_family = AF_INET6;
	}

	/* Local (source) address: same family-stability rule as above. */
	if (data[IFLA_VXLAN_LOCAL]) {
		if (changelink && (conf->saddr.sa.sa_family != AF_INET)) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_LOCAL], "New local address family does not match old");
			return -EOPNOTSUPP;
		}

		conf->saddr.sin.sin_addr.s_addr = nla_get_in_addr(data[IFLA_VXLAN_LOCAL]);
		conf->saddr.sa.sa_family = AF_INET;
	} else if (data[IFLA_VXLAN_LOCAL6]) {
		if (!IS_ENABLED(CONFIG_IPV6)) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_LOCAL6], "IPv6 support not enabled in the kernel");
			return -EPFNOSUPPORT;
		}

		if (changelink && (conf->saddr.sa.sa_family != AF_INET6)) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_LOCAL6], "New local address family does not match old");
			return -EOPNOTSUPP;
		}

		/* TODO: respect scope id */
		conf->saddr.sin6.sin6_addr = nla_get_in6_addr(data[IFLA_VXLAN_LOCAL6]);
		conf->saddr.sa.sa_family = AF_INET6;
	}

	if (data[IFLA_VXLAN_LINK])
		conf->remote_ifindex = nla_get_u32(data[IFLA_VXLAN_LINK]);

	if (data[IFLA_VXLAN_TOS])
		conf->tos  = nla_get_u8(data[IFLA_VXLAN_TOS]);

	if (data[IFLA_VXLAN_TTL])
		conf->ttl = nla_get_u8(data[IFLA_VXLAN_TTL]);

	if (data[IFLA_VXLAN_TTL_INHERIT]) {
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_TTL_INHERIT,
				    VXLAN_F_TTL_INHERIT, changelink, false,
				    extack);
		if (err)
			return err;

	}

	if (data[IFLA_VXLAN_LABEL])
		conf->label = nla_get_be32(data[IFLA_VXLAN_LABEL]) &
			     IPV6_FLOWLABEL_MASK;

	if (data[IFLA_VXLAN_LEARNING]) {
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_LEARNING,
				    VXLAN_F_LEARN, changelink, true,
				    extack);
		if (err)
			return err;
	} else if (!changelink) {
		/* default to learn on a new device */
		conf->flags |= VXLAN_F_LEARN;
	}

	if (data[IFLA_VXLAN_AGEING])
		conf->age_interval = nla_get_u32(data[IFLA_VXLAN_AGEING]);

	if (data[IFLA_VXLAN_PROXY]) {
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_PROXY,
				    VXLAN_F_PROXY, changelink, false,
				    extack);
		if (err)
			return err;
	}

	if (data[IFLA_VXLAN_RSC]) {
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_RSC,
				    VXLAN_F_RSC, changelink, false,
				    extack);
		if (err)
			return err;
	}

	if (data[IFLA_VXLAN_L2MISS]) {
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_L2MISS,
				    VXLAN_F_L2MISS, changelink, false,
				    extack);
		if (err)
			return err;
	}

	if (data[IFLA_VXLAN_L3MISS]) {
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_L3MISS,
				    VXLAN_F_L3MISS, changelink, false,
				    extack);
		if (err)
			return err;
	}

	if (data[IFLA_VXLAN_LIMIT]) {
		if (changelink) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_LIMIT],
					    "Cannot change limit");
			return -EOPNOTSUPP;
		}
		conf->addrmax = nla_get_u32(data[IFLA_VXLAN_LIMIT]);
	}

	if (data[IFLA_VXLAN_COLLECT_METADATA]) {
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_COLLECT_METADATA,
				    VXLAN_F_COLLECT_METADATA, changelink, false,
				    extack);
		if (err)
			return err;
	}

	if (data[IFLA_VXLAN_PORT_RANGE]) {
		if (!changelink) {
			const struct ifla_vxlan_port_range *p
				= nla_data(data[IFLA_VXLAN_PORT_RANGE]);
			conf->port_min = ntohs(p->low);
			conf->port_max = ntohs(p->high);
		} else {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_PORT_RANGE],
					    "Cannot change port range");
			return -EOPNOTSUPP;
		}
	}

	if (data[IFLA_VXLAN_PORT]) {
		if (changelink) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_PORT],
					    "Cannot change port");
			return -EOPNOTSUPP;
		}
		conf->dst_port = nla_get_be16(data[IFLA_VXLAN_PORT]);
	}

	if (data[IFLA_VXLAN_UDP_CSUM]) {
		if (changelink) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_UDP_CSUM],
					    "Cannot change UDP_CSUM flag");
			return -EOPNOTSUPP;
		}
		/* attribute says "checksum on"; the flag stored is the
		 * inverse ("zero checksum"), hence the negation
		 */
		if (!nla_get_u8(data[IFLA_VXLAN_UDP_CSUM]))
			conf->flags |= VXLAN_F_UDP_ZERO_CSUM_TX;
	}

	if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_TX]) {
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_UDP_ZERO_CSUM6_TX,
				    VXLAN_F_UDP_ZERO_CSUM6_TX, changelink,
				    false, extack);
		if (err)
			return err;
	}

	if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_RX]) {
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_UDP_ZERO_CSUM6_RX,
				    VXLAN_F_UDP_ZERO_CSUM6_RX, changelink,
				    false, extack);
		if (err)
			return err;
	}

	if (data[IFLA_VXLAN_REMCSUM_TX]) {
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_REMCSUM_TX,
				    VXLAN_F_REMCSUM_TX, changelink, false,
				    extack);
		if (err)
			return err;
	}

	if (data[IFLA_VXLAN_REMCSUM_RX]) {
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_REMCSUM_RX,
				    VXLAN_F_REMCSUM_RX, changelink, false,
				    extack);
		if (err)
			return err;
	}

	if (data[IFLA_VXLAN_GBP]) {
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_GBP,
				    VXLAN_F_GBP, changelink, false, extack);
		if (err)
			return err;
	}

	if (data[IFLA_VXLAN_GPE]) {
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_GPE,
				    VXLAN_F_GPE, changelink, false,
				    extack);
		if (err)
			return err;
	}

	if (data[IFLA_VXLAN_REMCSUM_NOPARTIAL]) {
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_REMCSUM_NOPARTIAL,
				    VXLAN_F_REMCSUM_NOPARTIAL, changelink,
				    false, extack);
		if (err)
			return err;
	}

	if (tb[IFLA_MTU]) {
		if (changelink) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_MTU],
					    "Cannot change mtu");
			return -EOPNOTSUPP;
		}
		conf->mtu = nla_get_u32(tb[IFLA_MTU]);
	}

	if (data[IFLA_VXLAN_DF])
		conf->df = nla_get_u8(data[IFLA_VXLAN_DF]);

	return 0;
}
3883
3884 static int vxlan_newlink(struct net *src_net, struct net_device *dev,
3885                          struct nlattr *tb[], struct nlattr *data[],
3886                          struct netlink_ext_ack *extack)
3887 {
3888         struct vxlan_config conf;
3889         int err;
3890
3891         err = vxlan_nl2conf(tb, data, dev, &conf, false, extack);
3892         if (err)
3893                 return err;
3894
3895         return __vxlan_dev_create(src_net, dev, &conf, extack);
3896 }
3897
/* rtnl_link_ops changelink handler: apply new netlink attributes to an
 * existing device.
 *
 * Parses the attributes on top of the current configuration and
 * re-validates the result.  If the default remote address changed, the
 * all-zeros-MAC default FDB entry is swapped under hash_lock: the new
 * entry is installed (and notified) first so a failure leaves the old
 * state intact, then the stale entry is removed.  Finally the ageing
 * timer is kicked when its interval changed and the new configuration is
 * committed.
 *
 * Caller holds RTNL.  Returns 0 or a negative errno.
 */
static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[],
			    struct nlattr *data[],
			    struct netlink_ext_ack *extack)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_rdst *dst = &vxlan->default_dst;
	struct net_device *lowerdev;
	struct vxlan_config conf;
	int err;

	err = vxlan_nl2conf(tb, data, dev, &conf, true, extack);
	if (err)
		return err;

	err = vxlan_config_validate(vxlan->net, &conf, &lowerdev,
				    vxlan, extack);
	if (err)
		return err;

	/* handle default dst entry */
	if (!vxlan_addr_equal(&conf.remote_ip, &dst->remote_ip)) {
		spin_lock_bh(&vxlan->hash_lock);
		if (!vxlan_addr_any(&conf.remote_ip)) {
			err = vxlan_fdb_update(vxlan, all_zeros_mac,
					       &conf.remote_ip,
					       NUD_REACHABLE | NUD_PERMANENT,
					       NLM_F_APPEND | NLM_F_CREATE,
					       vxlan->cfg.dst_port,
					       conf.vni, conf.vni,
					       conf.remote_ifindex,
					       NTF_SELF, true, extack);
			if (err) {
				spin_unlock_bh(&vxlan->hash_lock);
				return err;
			}
		}
		if (!vxlan_addr_any(&dst->remote_ip))
			__vxlan_fdb_delete(vxlan, all_zeros_mac,
					   dst->remote_ip,
					   vxlan->cfg.dst_port,
					   dst->remote_vni,
					   dst->remote_vni,
					   dst->remote_ifindex,
					   true);
		spin_unlock_bh(&vxlan->hash_lock);
	}

	if (conf.age_interval != vxlan->cfg.age_interval)
		mod_timer(&vxlan->age_timer, jiffies);

	vxlan_config_apply(dev, &conf, lowerdev, vxlan->net, true);
	return 0;
}
3951
3952 static void vxlan_dellink(struct net_device *dev, struct list_head *head)
3953 {
3954         struct vxlan_dev *vxlan = netdev_priv(dev);
3955
3956         vxlan_flush(vxlan, true);
3957
3958         list_del(&vxlan->next);
3959         unregister_netdevice_queue(dev, head);
3960 }
3961
3962 static size_t vxlan_get_size(const struct net_device *dev)
3963 {
3964
3965         return nla_total_size(sizeof(__u32)) +  /* IFLA_VXLAN_ID */
3966                 nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_GROUP{6} */
3967                 nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LINK */
3968                 nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_LOCAL{6} */
3969                 nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_TTL */
3970                 nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_TTL_INHERIT */
3971                 nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_TOS */
3972                 nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_DF */
3973                 nla_total_size(sizeof(__be32)) + /* IFLA_VXLAN_LABEL */
3974                 nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_LEARNING */
3975                 nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_PROXY */
3976                 nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_RSC */
3977                 nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_L2MISS */
3978                 nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_L3MISS */
3979                 nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_COLLECT_METADATA */
3980                 nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_AGEING */
3981                 nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LIMIT */
3982                 nla_total_size(sizeof(struct ifla_vxlan_port_range)) +
3983                 nla_total_size(sizeof(__be16)) + /* IFLA_VXLAN_PORT */
3984                 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_CSUM */
3985                 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_TX */
3986                 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_RX */
3987                 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_TX */
3988                 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_RX */
3989                 0;
3990 }
3991
/* rtnl_link_ops .fill_info handler: dump the device configuration
 * (struct vxlan_config plus the default remote destination) as
 * IFLA_VXLAN_* netlink attributes.  Must stay in sync with
 * vxlan_get_size(), which reserves skb room for every attribute
 * emitted here.  Returns 0 or -EMSGSIZE if the skb ran out of space.
 */
static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
	const struct vxlan_dev *vxlan = netdev_priv(dev);
	const struct vxlan_rdst *dst = &vxlan->default_dst;
	/* Source-port range is kept host-order in cfg but dumped big-endian. */
	struct ifla_vxlan_port_range ports = {
		.low =  htons(vxlan->cfg.port_min),
		.high = htons(vxlan->cfg.port_max),
	};

	if (nla_put_u32(skb, IFLA_VXLAN_ID, be32_to_cpu(dst->remote_vni)))
		goto nla_put_failure;

	/* Default remote address: IPv4 and IPv6 use distinct attributes. */
	if (!vxlan_addr_any(&dst->remote_ip)) {
		if (dst->remote_ip.sa.sa_family == AF_INET) {
			if (nla_put_in_addr(skb, IFLA_VXLAN_GROUP,
					    dst->remote_ip.sin.sin_addr.s_addr))
				goto nla_put_failure;
#if IS_ENABLED(CONFIG_IPV6)
		} else {
			if (nla_put_in6_addr(skb, IFLA_VXLAN_GROUP6,
					     &dst->remote_ip.sin6.sin6_addr))
				goto nla_put_failure;
#endif
		}
	}

	if (dst->remote_ifindex && nla_put_u32(skb, IFLA_VXLAN_LINK, dst->remote_ifindex))
		goto nla_put_failure;

	/* Local (source) address, only if one was explicitly configured. */
	if (!vxlan_addr_any(&vxlan->cfg.saddr)) {
		if (vxlan->cfg.saddr.sa.sa_family == AF_INET) {
			if (nla_put_in_addr(skb, IFLA_VXLAN_LOCAL,
					    vxlan->cfg.saddr.sin.sin_addr.s_addr))
				goto nla_put_failure;
#if IS_ENABLED(CONFIG_IPV6)
		} else {
			if (nla_put_in6_addr(skb, IFLA_VXLAN_LOCAL6,
					     &vxlan->cfg.saddr.sin6.sin6_addr))
				goto nla_put_failure;
#endif
		}
	}

	/* Scalar options; boolean ones are dumped as 0/1 derived from
	 * cfg.flags.  Note IFLA_VXLAN_UDP_CSUM is the *inverse* of the
	 * VXLAN_F_UDP_ZERO_CSUM_TX flag.
	 */
	if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->cfg.ttl) ||
	    nla_put_u8(skb, IFLA_VXLAN_TTL_INHERIT,
		       !!(vxlan->cfg.flags & VXLAN_F_TTL_INHERIT)) ||
	    nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->cfg.tos) ||
	    nla_put_u8(skb, IFLA_VXLAN_DF, vxlan->cfg.df) ||
	    nla_put_be32(skb, IFLA_VXLAN_LABEL, vxlan->cfg.label) ||
	    nla_put_u8(skb, IFLA_VXLAN_LEARNING,
			!!(vxlan->cfg.flags & VXLAN_F_LEARN)) ||
	    nla_put_u8(skb, IFLA_VXLAN_PROXY,
			!!(vxlan->cfg.flags & VXLAN_F_PROXY)) ||
	    nla_put_u8(skb, IFLA_VXLAN_RSC,
		       !!(vxlan->cfg.flags & VXLAN_F_RSC)) ||
	    nla_put_u8(skb, IFLA_VXLAN_L2MISS,
			!!(vxlan->cfg.flags & VXLAN_F_L2MISS)) ||
	    nla_put_u8(skb, IFLA_VXLAN_L3MISS,
			!!(vxlan->cfg.flags & VXLAN_F_L3MISS)) ||
	    nla_put_u8(skb, IFLA_VXLAN_COLLECT_METADATA,
		       !!(vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA)) ||
	    nla_put_u32(skb, IFLA_VXLAN_AGEING, vxlan->cfg.age_interval) ||
	    nla_put_u32(skb, IFLA_VXLAN_LIMIT, vxlan->cfg.addrmax) ||
	    nla_put_be16(skb, IFLA_VXLAN_PORT, vxlan->cfg.dst_port) ||
	    nla_put_u8(skb, IFLA_VXLAN_UDP_CSUM,
			!(vxlan->cfg.flags & VXLAN_F_UDP_ZERO_CSUM_TX)) ||
	    nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_TX,
			!!(vxlan->cfg.flags & VXLAN_F_UDP_ZERO_CSUM6_TX)) ||
	    nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_RX,
			!!(vxlan->cfg.flags & VXLAN_F_UDP_ZERO_CSUM6_RX)) ||
	    nla_put_u8(skb, IFLA_VXLAN_REMCSUM_TX,
			!!(vxlan->cfg.flags & VXLAN_F_REMCSUM_TX)) ||
	    nla_put_u8(skb, IFLA_VXLAN_REMCSUM_RX,
			!!(vxlan->cfg.flags & VXLAN_F_REMCSUM_RX)))
		goto nla_put_failure;

	if (nla_put(skb, IFLA_VXLAN_PORT_RANGE, sizeof(ports), &ports))
		goto nla_put_failure;

	/* Flag-only (valueless) attributes. */
	if (vxlan->cfg.flags & VXLAN_F_GBP &&
	    nla_put_flag(skb, IFLA_VXLAN_GBP))
		goto nla_put_failure;

	if (vxlan->cfg.flags & VXLAN_F_GPE &&
	    nla_put_flag(skb, IFLA_VXLAN_GPE))
		goto nla_put_failure;

	if (vxlan->cfg.flags & VXLAN_F_REMCSUM_NOPARTIAL &&
	    nla_put_flag(skb, IFLA_VXLAN_REMCSUM_NOPARTIAL))
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
4088
4089 static struct net *vxlan_get_link_net(const struct net_device *dev)
4090 {
4091         struct vxlan_dev *vxlan = netdev_priv(dev);
4092
4093         return vxlan->net;
4094 }
4095
/* rtnetlink glue for "ip link add ... type vxlan": wires netlink
 * create/change/delete/dump requests to the handlers above.
 */
static struct rtnl_link_ops vxlan_link_ops __read_mostly = {
	.kind		= "vxlan",			/* link type name */
	.maxtype	= IFLA_VXLAN_MAX,
	.policy		= vxlan_policy,			/* IFLA_VXLAN_* validation */
	.priv_size	= sizeof(struct vxlan_dev),	/* netdev_priv() area */
	.setup		= vxlan_setup,
	.validate	= vxlan_validate,
	.newlink	= vxlan_newlink,
	.changelink	= vxlan_changelink,
	.dellink	= vxlan_dellink,
	.get_size	= vxlan_get_size,		/* must match fill_info */
	.fill_info	= vxlan_fill_info,
	.get_link_net	= vxlan_get_link_net,
};
4110
4111 struct net_device *vxlan_dev_create(struct net *net, const char *name,
4112                                     u8 name_assign_type,
4113                                     struct vxlan_config *conf)
4114 {
4115         struct nlattr *tb[IFLA_MAX + 1];
4116         struct net_device *dev;
4117         int err;
4118
4119         memset(&tb, 0, sizeof(tb));
4120
4121         dev = rtnl_create_link(net, name, name_assign_type,
4122                                &vxlan_link_ops, tb, NULL);
4123         if (IS_ERR(dev))
4124                 return dev;
4125
4126         err = __vxlan_dev_create(net, dev, conf, NULL);
4127         if (err < 0) {
4128                 free_netdev(dev);
4129                 return ERR_PTR(err);
4130         }
4131
4132         err = rtnl_configure_link(dev, NULL);
4133         if (err < 0) {
4134                 LIST_HEAD(list_kill);
4135
4136                 vxlan_dellink(dev, &list_kill);
4137                 unregister_netdevice_many(&list_kill);
4138                 return ERR_PTR(err);
4139         }
4140
4141         return dev;
4142 }
4143 EXPORT_SYMBOL_GPL(vxlan_dev_create);
4144
4145 static void vxlan_handle_lowerdev_unregister(struct vxlan_net *vn,
4146                                              struct net_device *dev)
4147 {
4148         struct vxlan_dev *vxlan, *next;
4149         LIST_HEAD(list_kill);
4150
4151         list_for_each_entry_safe(vxlan, next, &vn->vxlan_list, next) {
4152                 struct vxlan_rdst *dst = &vxlan->default_dst;
4153
4154                 /* In case we created vxlan device with carrier
4155                  * and we loose the carrier due to module unload
4156                  * we also need to remove vxlan device. In other
4157                  * cases, it's not necessary and remote_ifindex
4158                  * is 0 here, so no matches.
4159                  */
4160                 if (dst->remote_ifindex == dev->ifindex)
4161                         vxlan_dellink(vxlan->dev, &list_kill);
4162         }
4163
4164         unregister_netdevice_many(&list_kill);
4165 }
4166
4167 static int vxlan_netdevice_event(struct notifier_block *unused,
4168                                  unsigned long event, void *ptr)
4169 {
4170         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4171         struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
4172
4173         if (event == NETDEV_UNREGISTER) {
4174                 vxlan_offload_rx_ports(dev, false);
4175                 vxlan_handle_lowerdev_unregister(vn, dev);
4176         } else if (event == NETDEV_REGISTER) {
4177                 vxlan_offload_rx_ports(dev, true);
4178         } else if (event == NETDEV_UDP_TUNNEL_PUSH_INFO ||
4179                    event == NETDEV_UDP_TUNNEL_DROP_INFO) {
4180                 vxlan_offload_rx_ports(dev, event == NETDEV_UDP_TUNNEL_PUSH_INFO);
4181         }
4182
4183         return NOTIFY_DONE;
4184 }
4185
/* Registered on the netdevice notifier chain in vxlan_init_module(). */
static struct notifier_block vxlan_notifier_block __read_mostly = {
	.notifier_call = vxlan_netdevice_event,
};
4189
4190 static void
4191 vxlan_fdb_offloaded_set(struct net_device *dev,
4192                         struct switchdev_notifier_vxlan_fdb_info *fdb_info)
4193 {
4194         struct vxlan_dev *vxlan = netdev_priv(dev);
4195         struct vxlan_rdst *rdst;
4196         struct vxlan_fdb *f;
4197
4198         spin_lock_bh(&vxlan->hash_lock);
4199
4200         f = vxlan_find_mac(vxlan, fdb_info->eth_addr, fdb_info->vni);
4201         if (!f)
4202                 goto out;
4203
4204         rdst = vxlan_fdb_find_rdst(f, &fdb_info->remote_ip,
4205                                    fdb_info->remote_port,
4206                                    fdb_info->remote_vni,
4207                                    fdb_info->remote_ifindex);
4208         if (!rdst)
4209                 goto out;
4210
4211         rdst->offloaded = fdb_info->offloaded;
4212
4213 out:
4214         spin_unlock_bh(&vxlan->hash_lock);
4215 }
4216
4217 static int
4218 vxlan_fdb_external_learn_add(struct net_device *dev,
4219                              struct switchdev_notifier_vxlan_fdb_info *fdb_info)
4220 {
4221         struct vxlan_dev *vxlan = netdev_priv(dev);
4222         struct netlink_ext_ack *extack;
4223         int err;
4224
4225         extack = switchdev_notifier_info_to_extack(&fdb_info->info);
4226
4227         spin_lock_bh(&vxlan->hash_lock);
4228         err = vxlan_fdb_update(vxlan, fdb_info->eth_addr, &fdb_info->remote_ip,
4229                                NUD_REACHABLE,
4230                                NLM_F_CREATE | NLM_F_REPLACE,
4231                                fdb_info->remote_port,
4232                                fdb_info->vni,
4233                                fdb_info->remote_vni,
4234                                fdb_info->remote_ifindex,
4235                                NTF_USE | NTF_SELF | NTF_EXT_LEARNED,
4236                                false, extack);
4237         spin_unlock_bh(&vxlan->hash_lock);
4238
4239         return err;
4240 }
4241
4242 static int
4243 vxlan_fdb_external_learn_del(struct net_device *dev,
4244                              struct switchdev_notifier_vxlan_fdb_info *fdb_info)
4245 {
4246         struct vxlan_dev *vxlan = netdev_priv(dev);
4247         struct vxlan_fdb *f;
4248         int err = 0;
4249
4250         spin_lock_bh(&vxlan->hash_lock);
4251
4252         f = vxlan_find_mac(vxlan, fdb_info->eth_addr, fdb_info->vni);
4253         if (!f)
4254                 err = -ENOENT;
4255         else if (f->flags & NTF_EXT_LEARNED)
4256                 err = __vxlan_fdb_delete(vxlan, fdb_info->eth_addr,
4257                                          fdb_info->remote_ip,
4258                                          fdb_info->remote_port,
4259                                          fdb_info->vni,
4260                                          fdb_info->remote_vni,
4261                                          fdb_info->remote_ifindex,
4262                                          false);
4263
4264         spin_unlock_bh(&vxlan->hash_lock);
4265
4266         return err;
4267 }
4268
4269 static int vxlan_switchdev_event(struct notifier_block *unused,
4270                                  unsigned long event, void *ptr)
4271 {
4272         struct net_device *dev = switchdev_notifier_info_to_dev(ptr);
4273         struct switchdev_notifier_vxlan_fdb_info *fdb_info;
4274         int err = 0;
4275
4276         switch (event) {
4277         case SWITCHDEV_VXLAN_FDB_OFFLOADED:
4278                 vxlan_fdb_offloaded_set(dev, ptr);
4279                 break;
4280         case SWITCHDEV_VXLAN_FDB_ADD_TO_BRIDGE:
4281                 fdb_info = ptr;
4282                 err = vxlan_fdb_external_learn_add(dev, fdb_info);
4283                 if (err) {
4284                         err = notifier_from_errno(err);
4285                         break;
4286                 }
4287                 fdb_info->offloaded = true;
4288                 vxlan_fdb_offloaded_set(dev, fdb_info);
4289                 break;
4290         case SWITCHDEV_VXLAN_FDB_DEL_TO_BRIDGE:
4291                 fdb_info = ptr;
4292                 err = vxlan_fdb_external_learn_del(dev, fdb_info);
4293                 if (err) {
4294                         err = notifier_from_errno(err);
4295                         break;
4296                 }
4297                 fdb_info->offloaded = false;
4298                 vxlan_fdb_offloaded_set(dev, fdb_info);
4299                 break;
4300         }
4301
4302         return err;
4303 }
4304
/* Registered on the switchdev notifier chain in vxlan_init_module(). */
static struct notifier_block vxlan_switchdev_notifier_block __read_mostly = {
	.notifier_call = vxlan_switchdev_event,
};
4308
4309 static __net_init int vxlan_init_net(struct net *net)
4310 {
4311         struct vxlan_net *vn = net_generic(net, vxlan_net_id);
4312         unsigned int h;
4313
4314         INIT_LIST_HEAD(&vn->vxlan_list);
4315         spin_lock_init(&vn->sock_lock);
4316
4317         for (h = 0; h < PORT_HASH_SIZE; ++h)
4318                 INIT_HLIST_HEAD(&vn->sock_list[h]);
4319
4320         return 0;
4321 }
4322
/* Queue every vxlan device tied to the exiting netns @net onto @head so
 * the caller can batch-unregister them.  Two sources: devices registered
 * in @net itself, and devices in other netns whose underlay (socket)
 * netns is @net.  Called under rtnl_lock from vxlan_exit_batch_net().
 */
static void vxlan_destroy_tunnels(struct net *net, struct list_head *head)
{
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
	struct vxlan_dev *vxlan, *next;
	struct net_device *dev, *aux;
	unsigned int h;

	/* All vxlan netdevs that live in this netns. */
	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == &vxlan_link_ops)
			unregister_netdevice_queue(dev, head);

	list_for_each_entry_safe(vxlan, next, &vn->vxlan_list, next) {
		/* If vxlan->dev is in the same netns, it has already been added
		 * to the list by the previous loop.
		 */
		if (!net_eq(dev_net(vxlan->dev), net)) {
			/* NOTE(review): destroying the gro_cells here, before
			 * the device is actually unregistered, looks racy --
			 * packets may still be in flight on a running device.
			 * Upstream later moved gro_cells_destroy() into the
			 * device uninit path; confirm against current mainline.
			 */
			gro_cells_destroy(&vxlan->gro_cells);
			unregister_netdevice_queue(vxlan->dev, head);
		}
	}

	/* Every vxlan socket in this netns must be gone by now. */
	for (h = 0; h < PORT_HASH_SIZE; ++h)
		WARN_ON_ONCE(!hlist_empty(&vn->sock_list[h]));
}
4347
4348 static void __net_exit vxlan_exit_batch_net(struct list_head *net_list)
4349 {
4350         struct net *net;
4351         LIST_HEAD(list);
4352
4353         rtnl_lock();
4354         list_for_each_entry(net, net_list, exit_list)
4355                 vxlan_destroy_tunnels(net, &list);
4356
4357         unregister_netdevice_many(&list);
4358         rtnl_unlock();
4359 }
4360
/* Per-network-namespace lifecycle hooks; .size/.id let the core allocate
 * and look up our struct vxlan_net via net_generic().
 */
static struct pernet_operations vxlan_net_ops = {
	.init = vxlan_init_net,
	.exit_batch = vxlan_exit_batch_net,
	.id   = &vxlan_net_id,
	.size = sizeof(struct vxlan_net),
};
4367
4368 static int __init vxlan_init_module(void)
4369 {
4370         int rc;
4371
4372         get_random_bytes(&vxlan_salt, sizeof(vxlan_salt));
4373
4374         rc = register_pernet_subsys(&vxlan_net_ops);
4375         if (rc)
4376                 goto out1;
4377
4378         rc = register_netdevice_notifier(&vxlan_notifier_block);
4379         if (rc)
4380                 goto out2;
4381
4382         rc = register_switchdev_notifier(&vxlan_switchdev_notifier_block);
4383         if (rc)
4384                 goto out3;
4385
4386         rc = rtnl_link_register(&vxlan_link_ops);
4387         if (rc)
4388                 goto out4;
4389
4390         return 0;
4391 out4:
4392         unregister_switchdev_notifier(&vxlan_switchdev_notifier_block);
4393 out3:
4394         unregister_netdevice_notifier(&vxlan_notifier_block);
4395 out2:
4396         unregister_pernet_subsys(&vxlan_net_ops);
4397 out1:
4398         return rc;
4399 }
4400 late_initcall(vxlan_init_module);
4401
/* Module exit: undo vxlan_init_module() registrations in strict reverse
 * order so no notifier or link op can fire on torn-down state.
 */
static void __exit vxlan_cleanup_module(void)
{
	rtnl_link_unregister(&vxlan_link_ops);
	unregister_switchdev_notifier(&vxlan_switchdev_notifier_block);
	unregister_netdevice_notifier(&vxlan_notifier_block);
	unregister_pernet_subsys(&vxlan_net_ops);
	/* rcu_barrier() is called by netns */
}
module_exit(vxlan_cleanup_module);
4411
4412 MODULE_LICENSE("GPL");
4413 MODULE_VERSION(VXLAN_VERSION);
4414 MODULE_AUTHOR("Stephen Hemminger <stephen@networkplumber.org>");
4415 MODULE_DESCRIPTION("Driver for VXLAN encapsulated traffic");
4416 MODULE_ALIAS_RTNL_LINK("vxlan");