OSDN Git Service

clk: at91: fix masterck name
[uclinux-h8/linux.git] / drivers / net / vxlan.c
1 /*
2  * VXLAN: Virtual eXtensible Local Area Network
3  *
4  * Copyright (c) 2012-2013 Vyatta Inc.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 as
8  * published by the Free Software Foundation.
9  */
10
11 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
12
13 #include <linux/kernel.h>
14 #include <linux/module.h>
15 #include <linux/errno.h>
16 #include <linux/slab.h>
17 #include <linux/udp.h>
18 #include <linux/igmp.h>
19 #include <linux/if_ether.h>
20 #include <linux/ethtool.h>
21 #include <net/arp.h>
22 #include <net/ndisc.h>
23 #include <net/ip.h>
24 #include <net/icmp.h>
25 #include <net/rtnetlink.h>
26 #include <net/inet_ecn.h>
27 #include <net/net_namespace.h>
28 #include <net/netns/generic.h>
29 #include <net/tun_proto.h>
30 #include <net/vxlan.h>
31
32 #if IS_ENABLED(CONFIG_IPV6)
33 #include <net/ip6_tunnel.h>
34 #include <net/ip6_checksum.h>
35 #endif
36
/* Version string of this driver. */
#define VXLAN_VERSION		"0.1"

/* Number of buckets (as a power of two) in the per-namespace socket table. */
#define PORT_HASH_BITS		8
#define PORT_HASH_SIZE		(1 << PORT_HASH_BITS)

/* Forwarding database ageing. */
#define FDB_AGE_DEFAULT		300		/* 5 min */
#define FDB_AGE_INTERVAL	(10 * HZ)	/* rescan interval */
43
44 /* UDP port for VXLAN traffic.
45  * The IANA assigned port is 4789, but the Linux default is 8472
46  * for compatibility with early adopters.
47  */
48 static unsigned short vxlan_port __read_mostly = 8472;
49 module_param_named(udp_port, vxlan_port, ushort, 0444);
50 MODULE_PARM_DESC(udp_port, "Destination UDP port");
51
52 static bool log_ecn_error = true;
53 module_param(log_ecn_error, bool, 0644);
54 MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
55
56 static unsigned int vxlan_net_id;
57 static struct rtnl_link_ops vxlan_link_ops;
58
59 static const u8 all_zeros_mac[ETH_ALEN + 2];
60
61 static int vxlan_sock_add(struct vxlan_dev *vxlan);
62
63 static void vxlan_vs_del_dev(struct vxlan_dev *vxlan);
64
65 /* per-network namespace private data for this module */
66 struct vxlan_net {
67         struct list_head  vxlan_list;
68         struct hlist_head sock_list[PORT_HASH_SIZE];
69         spinlock_t        sock_lock;
70 };
71
72 /* Forwarding table entry */
73 struct vxlan_fdb {
74         struct hlist_node hlist;        /* linked list of entries */
75         struct rcu_head   rcu;
76         unsigned long     updated;      /* jiffies */
77         unsigned long     used;
78         struct list_head  remotes;
79         u8                eth_addr[ETH_ALEN];
80         u16               state;        /* see ndm_state */
81         __be32            vni;
82         u16               flags;        /* see ndm_flags and below */
83 };
84
85 #define NTF_VXLAN_ADDED_BY_USER 0x100
86
87 /* salt for hash table */
88 static u32 vxlan_salt __read_mostly;
89
90 static inline bool vxlan_collect_metadata(struct vxlan_sock *vs)
91 {
92         return vs->flags & VXLAN_F_COLLECT_METADATA ||
93                ip_tunnel_collect_metadata();
94 }
95
96 #if IS_ENABLED(CONFIG_IPV6)
97 static inline
98 bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b)
99 {
100         if (a->sa.sa_family != b->sa.sa_family)
101                 return false;
102         if (a->sa.sa_family == AF_INET6)
103                 return ipv6_addr_equal(&a->sin6.sin6_addr, &b->sin6.sin6_addr);
104         else
105                 return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr;
106 }
107
108 static int vxlan_nla_get_addr(union vxlan_addr *ip, struct nlattr *nla)
109 {
110         if (nla_len(nla) >= sizeof(struct in6_addr)) {
111                 ip->sin6.sin6_addr = nla_get_in6_addr(nla);
112                 ip->sa.sa_family = AF_INET6;
113                 return 0;
114         } else if (nla_len(nla) >= sizeof(__be32)) {
115                 ip->sin.sin_addr.s_addr = nla_get_in_addr(nla);
116                 ip->sa.sa_family = AF_INET;
117                 return 0;
118         } else {
119                 return -EAFNOSUPPORT;
120         }
121 }
122
123 static int vxlan_nla_put_addr(struct sk_buff *skb, int attr,
124                               const union vxlan_addr *ip)
125 {
126         if (ip->sa.sa_family == AF_INET6)
127                 return nla_put_in6_addr(skb, attr, &ip->sin6.sin6_addr);
128         else
129                 return nla_put_in_addr(skb, attr, ip->sin.sin_addr.s_addr);
130 }
131
132 #else /* !CONFIG_IPV6 */
133
134 static inline
135 bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b)
136 {
137         return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr;
138 }
139
140 static int vxlan_nla_get_addr(union vxlan_addr *ip, struct nlattr *nla)
141 {
142         if (nla_len(nla) >= sizeof(struct in6_addr)) {
143                 return -EAFNOSUPPORT;
144         } else if (nla_len(nla) >= sizeof(__be32)) {
145                 ip->sin.sin_addr.s_addr = nla_get_in_addr(nla);
146                 ip->sa.sa_family = AF_INET;
147                 return 0;
148         } else {
149                 return -EAFNOSUPPORT;
150         }
151 }
152
153 static int vxlan_nla_put_addr(struct sk_buff *skb, int attr,
154                               const union vxlan_addr *ip)
155 {
156         return nla_put_in_addr(skb, attr, ip->sin.sin_addr.s_addr);
157 }
158 #endif
159
160 /* Virtual Network hash table head */
161 static inline struct hlist_head *vni_head(struct vxlan_sock *vs, __be32 vni)
162 {
163         return &vs->vni_list[hash_32((__force u32)vni, VNI_HASH_BITS)];
164 }
165
166 /* Socket hash table head */
167 static inline struct hlist_head *vs_head(struct net *net, __be16 port)
168 {
169         struct vxlan_net *vn = net_generic(net, vxlan_net_id);
170
171         return &vn->sock_list[hash_32(ntohs(port), PORT_HASH_BITS)];
172 }
173
174 /* First remote destination for a forwarding entry.
175  * Guaranteed to be non-NULL because remotes are never deleted.
176  */
177 static inline struct vxlan_rdst *first_remote_rcu(struct vxlan_fdb *fdb)
178 {
179         return list_entry_rcu(fdb->remotes.next, struct vxlan_rdst, list);
180 }
181
182 static inline struct vxlan_rdst *first_remote_rtnl(struct vxlan_fdb *fdb)
183 {
184         return list_first_entry(&fdb->remotes, struct vxlan_rdst, list);
185 }
186
187 /* Find VXLAN socket based on network namespace, address family and UDP port
188  * and enabled unshareable flags.
189  */
190 static struct vxlan_sock *vxlan_find_sock(struct net *net, sa_family_t family,
191                                           __be16 port, u32 flags, int ifindex)
192 {
193         struct vxlan_sock *vs;
194
195         flags &= VXLAN_F_RCV_FLAGS;
196
197         hlist_for_each_entry_rcu(vs, vs_head(net, port), hlist) {
198                 if (inet_sk(vs->sock->sk)->inet_sport == port &&
199                     vxlan_get_sk_family(vs) == family &&
200                     vs->flags == flags &&
201                     vs->sock->sk->sk_bound_dev_if == ifindex)
202                         return vs;
203         }
204         return NULL;
205 }
206
207 static struct vxlan_dev *vxlan_vs_find_vni(struct vxlan_sock *vs, int ifindex,
208                                            __be32 vni)
209 {
210         struct vxlan_dev_node *node;
211
212         /* For flow based devices, map all packets to VNI 0 */
213         if (vs->flags & VXLAN_F_COLLECT_METADATA)
214                 vni = 0;
215
216         hlist_for_each_entry_rcu(node, vni_head(vs, vni), hlist) {
217                 if (node->vxlan->default_dst.remote_vni != vni)
218                         continue;
219
220                 if (IS_ENABLED(CONFIG_IPV6)) {
221                         const struct vxlan_config *cfg = &node->vxlan->cfg;
222
223                         if ((cfg->flags & VXLAN_F_IPV6_LINKLOCAL) &&
224                             cfg->remote_ifindex != ifindex)
225                                 continue;
226                 }
227
228                 return node->vxlan;
229         }
230
231         return NULL;
232 }
233
234 /* Look up VNI in a per net namespace table */
235 static struct vxlan_dev *vxlan_find_vni(struct net *net, int ifindex,
236                                         __be32 vni, sa_family_t family,
237                                         __be16 port, u32 flags)
238 {
239         struct vxlan_sock *vs;
240
241         vs = vxlan_find_sock(net, family, port, flags, ifindex);
242         if (!vs)
243                 return NULL;
244
245         return vxlan_vs_find_vni(vs, ifindex, vni);
246 }
247
248 /* Fill in neighbour message in skbuff. */
249 static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan,
250                           const struct vxlan_fdb *fdb,
251                           u32 portid, u32 seq, int type, unsigned int flags,
252                           const struct vxlan_rdst *rdst)
253 {
254         unsigned long now = jiffies;
255         struct nda_cacheinfo ci;
256         struct nlmsghdr *nlh;
257         struct ndmsg *ndm;
258         bool send_ip, send_eth;
259
260         nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags);
261         if (nlh == NULL)
262                 return -EMSGSIZE;
263
264         ndm = nlmsg_data(nlh);
265         memset(ndm, 0, sizeof(*ndm));
266
267         send_eth = send_ip = true;
268
269         if (type == RTM_GETNEIGH) {
270                 send_ip = !vxlan_addr_any(&rdst->remote_ip);
271                 send_eth = !is_zero_ether_addr(fdb->eth_addr);
272                 ndm->ndm_family = send_ip ? rdst->remote_ip.sa.sa_family : AF_INET;
273         } else
274                 ndm->ndm_family = AF_BRIDGE;
275         ndm->ndm_state = fdb->state;
276         ndm->ndm_ifindex = vxlan->dev->ifindex;
277         ndm->ndm_flags = fdb->flags;
278         if (rdst->offloaded)
279                 ndm->ndm_flags |= NTF_OFFLOADED;
280         ndm->ndm_type = RTN_UNICAST;
281
282         if (!net_eq(dev_net(vxlan->dev), vxlan->net) &&
283             nla_put_s32(skb, NDA_LINK_NETNSID,
284                         peernet2id(dev_net(vxlan->dev), vxlan->net)))
285                 goto nla_put_failure;
286
287         if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr))
288                 goto nla_put_failure;
289
290         if (send_ip && vxlan_nla_put_addr(skb, NDA_DST, &rdst->remote_ip))
291                 goto nla_put_failure;
292
293         if (rdst->remote_port && rdst->remote_port != vxlan->cfg.dst_port &&
294             nla_put_be16(skb, NDA_PORT, rdst->remote_port))
295                 goto nla_put_failure;
296         if (rdst->remote_vni != vxlan->default_dst.remote_vni &&
297             nla_put_u32(skb, NDA_VNI, be32_to_cpu(rdst->remote_vni)))
298                 goto nla_put_failure;
299         if ((vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) && fdb->vni &&
300             nla_put_u32(skb, NDA_SRC_VNI,
301                         be32_to_cpu(fdb->vni)))
302                 goto nla_put_failure;
303         if (rdst->remote_ifindex &&
304             nla_put_u32(skb, NDA_IFINDEX, rdst->remote_ifindex))
305                 goto nla_put_failure;
306
307         ci.ndm_used      = jiffies_to_clock_t(now - fdb->used);
308         ci.ndm_confirmed = 0;
309         ci.ndm_updated   = jiffies_to_clock_t(now - fdb->updated);
310         ci.ndm_refcnt    = 0;
311
312         if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci))
313                 goto nla_put_failure;
314
315         nlmsg_end(skb, nlh);
316         return 0;
317
318 nla_put_failure:
319         nlmsg_cancel(skb, nlh);
320         return -EMSGSIZE;
321 }
322
323 static inline size_t vxlan_nlmsg_size(void)
324 {
325         return NLMSG_ALIGN(sizeof(struct ndmsg))
326                 + nla_total_size(ETH_ALEN) /* NDA_LLADDR */
327                 + nla_total_size(sizeof(struct in6_addr)) /* NDA_DST */
328                 + nla_total_size(sizeof(__be16)) /* NDA_PORT */
329                 + nla_total_size(sizeof(__be32)) /* NDA_VNI */
330                 + nla_total_size(sizeof(__u32)) /* NDA_IFINDEX */
331                 + nla_total_size(sizeof(__s32)) /* NDA_LINK_NETNSID */
332                 + nla_total_size(sizeof(struct nda_cacheinfo));
333 }
334
335 static void __vxlan_fdb_notify(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb,
336                                struct vxlan_rdst *rd, int type)
337 {
338         struct net *net = dev_net(vxlan->dev);
339         struct sk_buff *skb;
340         int err = -ENOBUFS;
341
342         skb = nlmsg_new(vxlan_nlmsg_size(), GFP_ATOMIC);
343         if (skb == NULL)
344                 goto errout;
345
346         err = vxlan_fdb_info(skb, vxlan, fdb, 0, 0, type, 0, rd);
347         if (err < 0) {
348                 /* -EMSGSIZE implies BUG in vxlan_nlmsg_size() */
349                 WARN_ON(err == -EMSGSIZE);
350                 kfree_skb(skb);
351                 goto errout;
352         }
353
354         rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
355         return;
356 errout:
357         if (err < 0)
358                 rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
359 }
360
361 static void vxlan_fdb_switchdev_notifier_info(const struct vxlan_dev *vxlan,
362                             const struct vxlan_fdb *fdb,
363                             const struct vxlan_rdst *rd,
364                             struct switchdev_notifier_vxlan_fdb_info *fdb_info)
365 {
366         fdb_info->info.dev = vxlan->dev;
367         fdb_info->info.extack = NULL;
368         fdb_info->remote_ip = rd->remote_ip;
369         fdb_info->remote_port = rd->remote_port;
370         fdb_info->remote_vni = rd->remote_vni;
371         fdb_info->remote_ifindex = rd->remote_ifindex;
372         memcpy(fdb_info->eth_addr, fdb->eth_addr, ETH_ALEN);
373         fdb_info->vni = fdb->vni;
374         fdb_info->offloaded = rd->offloaded;
375         fdb_info->added_by_user = fdb->flags & NTF_VXLAN_ADDED_BY_USER;
376 }
377
378 static void vxlan_fdb_switchdev_call_notifiers(struct vxlan_dev *vxlan,
379                                                struct vxlan_fdb *fdb,
380                                                struct vxlan_rdst *rd,
381                                                bool adding)
382 {
383         struct switchdev_notifier_vxlan_fdb_info info;
384         enum switchdev_notifier_type notifier_type;
385
386         if (WARN_ON(!rd))
387                 return;
388
389         notifier_type = adding ? SWITCHDEV_VXLAN_FDB_ADD_TO_DEVICE
390                                : SWITCHDEV_VXLAN_FDB_DEL_TO_DEVICE;
391         vxlan_fdb_switchdev_notifier_info(vxlan, fdb, rd, &info);
392         call_switchdev_notifiers(notifier_type, vxlan->dev,
393                                  &info.info);
394 }
395
396 static void vxlan_fdb_notify(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb,
397                              struct vxlan_rdst *rd, int type, bool swdev_notify)
398 {
399         if (swdev_notify) {
400                 switch (type) {
401                 case RTM_NEWNEIGH:
402                         vxlan_fdb_switchdev_call_notifiers(vxlan, fdb, rd,
403                                                            true);
404                         break;
405                 case RTM_DELNEIGH:
406                         vxlan_fdb_switchdev_call_notifiers(vxlan, fdb, rd,
407                                                            false);
408                         break;
409                 }
410         }
411
412         __vxlan_fdb_notify(vxlan, fdb, rd, type);
413 }
414
415 static void vxlan_ip_miss(struct net_device *dev, union vxlan_addr *ipa)
416 {
417         struct vxlan_dev *vxlan = netdev_priv(dev);
418         struct vxlan_fdb f = {
419                 .state = NUD_STALE,
420         };
421         struct vxlan_rdst remote = {
422                 .remote_ip = *ipa, /* goes to NDA_DST */
423                 .remote_vni = cpu_to_be32(VXLAN_N_VID),
424         };
425
426         vxlan_fdb_notify(vxlan, &f, &remote, RTM_GETNEIGH, true);
427 }
428
429 static void vxlan_fdb_miss(struct vxlan_dev *vxlan, const u8 eth_addr[ETH_ALEN])
430 {
431         struct vxlan_fdb f = {
432                 .state = NUD_STALE,
433         };
434         struct vxlan_rdst remote = { };
435
436         memcpy(f.eth_addr, eth_addr, ETH_ALEN);
437
438         vxlan_fdb_notify(vxlan, &f, &remote, RTM_GETNEIGH, true);
439 }
440
441 /* Hash Ethernet address */
442 static u32 eth_hash(const unsigned char *addr)
443 {
444         u64 value = get_unaligned((u64 *)addr);
445
446         /* only want 6 bytes */
447 #ifdef __BIG_ENDIAN
448         value >>= 16;
449 #else
450         value <<= 16;
451 #endif
452         return hash_64(value, FDB_HASH_BITS);
453 }
454
455 static u32 eth_vni_hash(const unsigned char *addr, __be32 vni)
456 {
457         /* use 1 byte of OUI and 3 bytes of NIC */
458         u32 key = get_unaligned((u32 *)(addr + 2));
459
460         return jhash_2words(key, vni, vxlan_salt) & (FDB_HASH_SIZE - 1);
461 }
462
463 /* Hash chain to use given mac address */
464 static inline struct hlist_head *vxlan_fdb_head(struct vxlan_dev *vxlan,
465                                                 const u8 *mac, __be32 vni)
466 {
467         if (vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA)
468                 return &vxlan->fdb_head[eth_vni_hash(mac, vni)];
469         else
470                 return &vxlan->fdb_head[eth_hash(mac)];
471 }
472
473 /* Look up Ethernet address in forwarding table */
474 static struct vxlan_fdb *__vxlan_find_mac(struct vxlan_dev *vxlan,
475                                           const u8 *mac, __be32 vni)
476 {
477         struct hlist_head *head = vxlan_fdb_head(vxlan, mac, vni);
478         struct vxlan_fdb *f;
479
480         hlist_for_each_entry_rcu(f, head, hlist) {
481                 if (ether_addr_equal(mac, f->eth_addr)) {
482                         if (vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) {
483                                 if (vni == f->vni)
484                                         return f;
485                         } else {
486                                 return f;
487                         }
488                 }
489         }
490
491         return NULL;
492 }
493
494 static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan,
495                                         const u8 *mac, __be32 vni)
496 {
497         struct vxlan_fdb *f;
498
499         f = __vxlan_find_mac(vxlan, mac, vni);
500         if (f && f->used != jiffies)
501                 f->used = jiffies;
502
503         return f;
504 }
505
506 /* caller should hold vxlan->hash_lock */
507 static struct vxlan_rdst *vxlan_fdb_find_rdst(struct vxlan_fdb *f,
508                                               union vxlan_addr *ip, __be16 port,
509                                               __be32 vni, __u32 ifindex)
510 {
511         struct vxlan_rdst *rd;
512
513         list_for_each_entry(rd, &f->remotes, list) {
514                 if (vxlan_addr_equal(&rd->remote_ip, ip) &&
515                     rd->remote_port == port &&
516                     rd->remote_vni == vni &&
517                     rd->remote_ifindex == ifindex)
518                         return rd;
519         }
520
521         return NULL;
522 }
523
524 int vxlan_fdb_find_uc(struct net_device *dev, const u8 *mac, __be32 vni,
525                       struct switchdev_notifier_vxlan_fdb_info *fdb_info)
526 {
527         struct vxlan_dev *vxlan = netdev_priv(dev);
528         u8 eth_addr[ETH_ALEN + 2] = { 0 };
529         struct vxlan_rdst *rdst;
530         struct vxlan_fdb *f;
531         int rc = 0;
532
533         if (is_multicast_ether_addr(mac) ||
534             is_zero_ether_addr(mac))
535                 return -EINVAL;
536
537         ether_addr_copy(eth_addr, mac);
538
539         rcu_read_lock();
540
541         f = __vxlan_find_mac(vxlan, eth_addr, vni);
542         if (!f) {
543                 rc = -ENOENT;
544                 goto out;
545         }
546
547         rdst = first_remote_rcu(f);
548         vxlan_fdb_switchdev_notifier_info(vxlan, f, rdst, fdb_info);
549
550 out:
551         rcu_read_unlock();
552         return rc;
553 }
554 EXPORT_SYMBOL_GPL(vxlan_fdb_find_uc);
555
556 static int vxlan_fdb_notify_one(struct notifier_block *nb,
557                                 const struct vxlan_dev *vxlan,
558                                 const struct vxlan_fdb *f,
559                                 const struct vxlan_rdst *rdst)
560 {
561         struct switchdev_notifier_vxlan_fdb_info fdb_info;
562         int rc;
563
564         vxlan_fdb_switchdev_notifier_info(vxlan, f, rdst, &fdb_info);
565         rc = nb->notifier_call(nb, SWITCHDEV_VXLAN_FDB_ADD_TO_DEVICE,
566                                &fdb_info);
567         return notifier_to_errno(rc);
568 }
569
570 int vxlan_fdb_replay(const struct net_device *dev, __be32 vni,
571                      struct notifier_block *nb)
572 {
573         struct vxlan_dev *vxlan;
574         struct vxlan_rdst *rdst;
575         struct vxlan_fdb *f;
576         unsigned int h;
577         int rc = 0;
578
579         if (!netif_is_vxlan(dev))
580                 return -EINVAL;
581         vxlan = netdev_priv(dev);
582
583         spin_lock_bh(&vxlan->hash_lock);
584         for (h = 0; h < FDB_HASH_SIZE; ++h) {
585                 hlist_for_each_entry(f, &vxlan->fdb_head[h], hlist) {
586                         if (f->vni == vni) {
587                                 list_for_each_entry(rdst, &f->remotes, list) {
588                                         rc = vxlan_fdb_notify_one(nb, vxlan,
589                                                                   f, rdst);
590                                         if (rc)
591                                                 goto out;
592                                 }
593                         }
594                 }
595         }
596
597 out:
598         spin_unlock_bh(&vxlan->hash_lock);
599         return rc;
600 }
601 EXPORT_SYMBOL_GPL(vxlan_fdb_replay);
602
603 void vxlan_fdb_clear_offload(const struct net_device *dev, __be32 vni)
604 {
605         struct vxlan_dev *vxlan;
606         struct vxlan_rdst *rdst;
607         struct vxlan_fdb *f;
608         unsigned int h;
609
610         if (!netif_is_vxlan(dev))
611                 return;
612         vxlan = netdev_priv(dev);
613
614         spin_lock_bh(&vxlan->hash_lock);
615         for (h = 0; h < FDB_HASH_SIZE; ++h) {
616                 hlist_for_each_entry(f, &vxlan->fdb_head[h], hlist)
617                         if (f->vni == vni)
618                                 list_for_each_entry(rdst, &f->remotes, list)
619                                         rdst->offloaded = false;
620         }
621         spin_unlock_bh(&vxlan->hash_lock);
622 }
623 EXPORT_SYMBOL_GPL(vxlan_fdb_clear_offload);
624
625 /* Replace destination of unicast mac */
626 static int vxlan_fdb_replace(struct vxlan_fdb *f,
627                              union vxlan_addr *ip, __be16 port, __be32 vni,
628                              __u32 ifindex)
629 {
630         struct vxlan_rdst *rd;
631
632         rd = vxlan_fdb_find_rdst(f, ip, port, vni, ifindex);
633         if (rd)
634                 return 0;
635
636         rd = list_first_entry_or_null(&f->remotes, struct vxlan_rdst, list);
637         if (!rd)
638                 return 0;
639
640         dst_cache_reset(&rd->dst_cache);
641         rd->remote_ip = *ip;
642         rd->remote_port = port;
643         rd->remote_vni = vni;
644         rd->remote_ifindex = ifindex;
645         rd->offloaded = false;
646         return 1;
647 }
648
649 /* Add/update destinations for multicast */
650 static int vxlan_fdb_append(struct vxlan_fdb *f,
651                             union vxlan_addr *ip, __be16 port, __be32 vni,
652                             __u32 ifindex, struct vxlan_rdst **rdp)
653 {
654         struct vxlan_rdst *rd;
655
656         rd = vxlan_fdb_find_rdst(f, ip, port, vni, ifindex);
657         if (rd)
658                 return 0;
659
660         rd = kmalloc(sizeof(*rd), GFP_ATOMIC);
661         if (rd == NULL)
662                 return -ENOBUFS;
663
664         if (dst_cache_init(&rd->dst_cache, GFP_ATOMIC)) {
665                 kfree(rd);
666                 return -ENOBUFS;
667         }
668
669         rd->remote_ip = *ip;
670         rd->remote_port = port;
671         rd->offloaded = false;
672         rd->remote_vni = vni;
673         rd->remote_ifindex = ifindex;
674
675         list_add_tail_rcu(&rd->list, &f->remotes);
676
677         *rdp = rd;
678         return 1;
679 }
680
681 static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb,
682                                           unsigned int off,
683                                           struct vxlanhdr *vh, size_t hdrlen,
684                                           __be32 vni_field,
685                                           struct gro_remcsum *grc,
686                                           bool nopartial)
687 {
688         size_t start, offset;
689
690         if (skb->remcsum_offload)
691                 return vh;
692
693         if (!NAPI_GRO_CB(skb)->csum_valid)
694                 return NULL;
695
696         start = vxlan_rco_start(vni_field);
697         offset = start + vxlan_rco_offset(vni_field);
698
699         vh = skb_gro_remcsum_process(skb, (void *)vh, off, hdrlen,
700                                      start, offset, grc, nopartial);
701
702         skb->remcsum_offload = 1;
703
704         return vh;
705 }
706
707 static struct sk_buff *vxlan_gro_receive(struct sock *sk,
708                                          struct list_head *head,
709                                          struct sk_buff *skb)
710 {
711         struct sk_buff *pp = NULL;
712         struct sk_buff *p;
713         struct vxlanhdr *vh, *vh2;
714         unsigned int hlen, off_vx;
715         int flush = 1;
716         struct vxlan_sock *vs = rcu_dereference_sk_user_data(sk);
717         __be32 flags;
718         struct gro_remcsum grc;
719
720         skb_gro_remcsum_init(&grc);
721
722         off_vx = skb_gro_offset(skb);
723         hlen = off_vx + sizeof(*vh);
724         vh   = skb_gro_header_fast(skb, off_vx);
725         if (skb_gro_header_hard(skb, hlen)) {
726                 vh = skb_gro_header_slow(skb, hlen, off_vx);
727                 if (unlikely(!vh))
728                         goto out;
729         }
730
731         skb_gro_postpull_rcsum(skb, vh, sizeof(struct vxlanhdr));
732
733         flags = vh->vx_flags;
734
735         if ((flags & VXLAN_HF_RCO) && (vs->flags & VXLAN_F_REMCSUM_RX)) {
736                 vh = vxlan_gro_remcsum(skb, off_vx, vh, sizeof(struct vxlanhdr),
737                                        vh->vx_vni, &grc,
738                                        !!(vs->flags &
739                                           VXLAN_F_REMCSUM_NOPARTIAL));
740
741                 if (!vh)
742                         goto out;
743         }
744
745         skb_gro_pull(skb, sizeof(struct vxlanhdr)); /* pull vxlan header */
746
747         list_for_each_entry(p, head, list) {
748                 if (!NAPI_GRO_CB(p)->same_flow)
749                         continue;
750
751                 vh2 = (struct vxlanhdr *)(p->data + off_vx);
752                 if (vh->vx_flags != vh2->vx_flags ||
753                     vh->vx_vni != vh2->vx_vni) {
754                         NAPI_GRO_CB(p)->same_flow = 0;
755                         continue;
756                 }
757         }
758
759         pp = call_gro_receive(eth_gro_receive, head, skb);
760         flush = 0;
761
762 out:
763         skb_gro_flush_final_remcsum(skb, pp, flush, &grc);
764
765         return pp;
766 }
767
768 static int vxlan_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff)
769 {
770         /* Sets 'skb->inner_mac_header' since we are always called with
771          * 'skb->encapsulation' set.
772          */
773         return eth_gro_complete(skb, nhoff + sizeof(struct vxlanhdr));
774 }
775
776 static struct vxlan_fdb *vxlan_fdb_alloc(struct vxlan_dev *vxlan,
777                                          const u8 *mac, __u16 state,
778                                          __be32 src_vni, __u16 ndm_flags)
779 {
780         struct vxlan_fdb *f;
781
782         f = kmalloc(sizeof(*f), GFP_ATOMIC);
783         if (!f)
784                 return NULL;
785         f->state = state;
786         f->flags = ndm_flags;
787         f->updated = f->used = jiffies;
788         f->vni = src_vni;
789         INIT_LIST_HEAD(&f->remotes);
790         memcpy(f->eth_addr, mac, ETH_ALEN);
791
792         return f;
793 }
794
795 static int vxlan_fdb_create(struct vxlan_dev *vxlan,
796                             const u8 *mac, union vxlan_addr *ip,
797                             __u16 state, __be16 port, __be32 src_vni,
798                             __be32 vni, __u32 ifindex, __u16 ndm_flags,
799                             struct vxlan_fdb **fdb)
800 {
801         struct vxlan_rdst *rd = NULL;
802         struct vxlan_fdb *f;
803         int rc;
804
805         if (vxlan->cfg.addrmax &&
806             vxlan->addrcnt >= vxlan->cfg.addrmax)
807                 return -ENOSPC;
808
809         netdev_dbg(vxlan->dev, "add %pM -> %pIS\n", mac, ip);
810         f = vxlan_fdb_alloc(vxlan, mac, state, src_vni, ndm_flags);
811         if (!f)
812                 return -ENOMEM;
813
814         rc = vxlan_fdb_append(f, ip, port, vni, ifindex, &rd);
815         if (rc < 0) {
816                 kfree(f);
817                 return rc;
818         }
819
820         ++vxlan->addrcnt;
821         hlist_add_head_rcu(&f->hlist,
822                            vxlan_fdb_head(vxlan, mac, src_vni));
823
824         *fdb = f;
825
826         return 0;
827 }
828
829 /* Add new entry to forwarding table -- assumes lock held */
830 static int vxlan_fdb_update(struct vxlan_dev *vxlan,
831                             const u8 *mac, union vxlan_addr *ip,
832                             __u16 state, __u16 flags,
833                             __be16 port, __be32 src_vni, __be32 vni,
834                             __u32 ifindex, __u16 ndm_flags,
835                             bool swdev_notify)
836 {
837         __u16 fdb_flags = (ndm_flags & ~NTF_USE);
838         struct vxlan_rdst *rd = NULL;
839         struct vxlan_fdb *f;
840         int notify = 0;
841         int rc;
842
843         f = __vxlan_find_mac(vxlan, mac, src_vni);
844         if (f) {
845                 if (flags & NLM_F_EXCL) {
846                         netdev_dbg(vxlan->dev,
847                                    "lost race to create %pM\n", mac);
848                         return -EEXIST;
849                 }
850
851                 /* Do not allow an externally learned entry to take over an
852                  * entry added by the user.
853                  */
854                 if (!(fdb_flags & NTF_EXT_LEARNED) ||
855                     !(f->flags & NTF_VXLAN_ADDED_BY_USER)) {
856                         if (f->state != state) {
857                                 f->state = state;
858                                 f->updated = jiffies;
859                                 notify = 1;
860                         }
861                         if (f->flags != fdb_flags) {
862                                 f->flags = fdb_flags;
863                                 f->updated = jiffies;
864                                 notify = 1;
865                         }
866                 }
867
868                 if ((flags & NLM_F_REPLACE)) {
869                         /* Only change unicasts */
870                         if (!(is_multicast_ether_addr(f->eth_addr) ||
871                              is_zero_ether_addr(f->eth_addr))) {
872                                 notify |= vxlan_fdb_replace(f, ip, port, vni,
873                                                            ifindex);
874                         } else
875                                 return -EOPNOTSUPP;
876                 }
877                 if ((flags & NLM_F_APPEND) &&
878                     (is_multicast_ether_addr(f->eth_addr) ||
879                      is_zero_ether_addr(f->eth_addr))) {
880                         rc = vxlan_fdb_append(f, ip, port, vni, ifindex, &rd);
881
882                         if (rc < 0)
883                                 return rc;
884                         notify |= rc;
885                 }
886
887                 if (ndm_flags & NTF_USE)
888                         f->used = jiffies;
889         } else {
890                 if (!(flags & NLM_F_CREATE))
891                         return -ENOENT;
892
893                 /* Disallow replace to add a multicast entry */
894                 if ((flags & NLM_F_REPLACE) &&
895                     (is_multicast_ether_addr(mac) || is_zero_ether_addr(mac)))
896                         return -EOPNOTSUPP;
897
898                 netdev_dbg(vxlan->dev, "add %pM -> %pIS\n", mac, ip);
899                 rc = vxlan_fdb_create(vxlan, mac, ip, state, port, src_vni,
900                                       vni, ifindex, fdb_flags, &f);
901                 if (rc < 0)
902                         return rc;
903                 notify = 1;
904         }
905
906         if (notify) {
907                 if (rd == NULL)
908                         rd = first_remote_rtnl(f);
909                 vxlan_fdb_notify(vxlan, f, rd, RTM_NEWNEIGH, swdev_notify);
910         }
911
912         return 0;
913 }
914
915 static void vxlan_fdb_free(struct rcu_head *head)
916 {
917         struct vxlan_fdb *f = container_of(head, struct vxlan_fdb, rcu);
918         struct vxlan_rdst *rd, *nd;
919
920         list_for_each_entry_safe(rd, nd, &f->remotes, list) {
921                 dst_cache_destroy(&rd->dst_cache);
922                 kfree(rd);
923         }
924         kfree(f);
925 }
926
927 static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f,
928                               bool do_notify, bool swdev_notify)
929 {
930         struct vxlan_rdst *rd;
931
932         netdev_dbg(vxlan->dev,
933                     "delete %pM\n", f->eth_addr);
934
935         --vxlan->addrcnt;
936         if (do_notify)
937                 list_for_each_entry(rd, &f->remotes, list)
938                         vxlan_fdb_notify(vxlan, f, rd, RTM_DELNEIGH,
939                                          swdev_notify);
940
941         hlist_del_rcu(&f->hlist);
942         call_rcu(&f->rcu, vxlan_fdb_free);
943 }
944
945 static void vxlan_dst_free(struct rcu_head *head)
946 {
947         struct vxlan_rdst *rd = container_of(head, struct vxlan_rdst, rcu);
948
949         dst_cache_destroy(&rd->dst_cache);
950         kfree(rd);
951 }
952
953 static void vxlan_fdb_dst_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f,
954                                   struct vxlan_rdst *rd, bool swdev_notify)
955 {
956         list_del_rcu(&rd->list);
957         vxlan_fdb_notify(vxlan, f, rd, RTM_DELNEIGH, swdev_notify);
958         call_rcu(&rd->rcu, vxlan_dst_free);
959 }
960
961 static int vxlan_fdb_parse(struct nlattr *tb[], struct vxlan_dev *vxlan,
962                            union vxlan_addr *ip, __be16 *port, __be32 *src_vni,
963                            __be32 *vni, u32 *ifindex)
964 {
965         struct net *net = dev_net(vxlan->dev);
966         int err;
967
968         if (tb[NDA_DST]) {
969                 err = vxlan_nla_get_addr(ip, tb[NDA_DST]);
970                 if (err)
971                         return err;
972         } else {
973                 union vxlan_addr *remote = &vxlan->default_dst.remote_ip;
974                 if (remote->sa.sa_family == AF_INET) {
975                         ip->sin.sin_addr.s_addr = htonl(INADDR_ANY);
976                         ip->sa.sa_family = AF_INET;
977 #if IS_ENABLED(CONFIG_IPV6)
978                 } else {
979                         ip->sin6.sin6_addr = in6addr_any;
980                         ip->sa.sa_family = AF_INET6;
981 #endif
982                 }
983         }
984
985         if (tb[NDA_PORT]) {
986                 if (nla_len(tb[NDA_PORT]) != sizeof(__be16))
987                         return -EINVAL;
988                 *port = nla_get_be16(tb[NDA_PORT]);
989         } else {
990                 *port = vxlan->cfg.dst_port;
991         }
992
993         if (tb[NDA_VNI]) {
994                 if (nla_len(tb[NDA_VNI]) != sizeof(u32))
995                         return -EINVAL;
996                 *vni = cpu_to_be32(nla_get_u32(tb[NDA_VNI]));
997         } else {
998                 *vni = vxlan->default_dst.remote_vni;
999         }
1000
1001         if (tb[NDA_SRC_VNI]) {
1002                 if (nla_len(tb[NDA_SRC_VNI]) != sizeof(u32))
1003                         return -EINVAL;
1004                 *src_vni = cpu_to_be32(nla_get_u32(tb[NDA_SRC_VNI]));
1005         } else {
1006                 *src_vni = vxlan->default_dst.remote_vni;
1007         }
1008
1009         if (tb[NDA_IFINDEX]) {
1010                 struct net_device *tdev;
1011
1012                 if (nla_len(tb[NDA_IFINDEX]) != sizeof(u32))
1013                         return -EINVAL;
1014                 *ifindex = nla_get_u32(tb[NDA_IFINDEX]);
1015                 tdev = __dev_get_by_index(net, *ifindex);
1016                 if (!tdev)
1017                         return -EADDRNOTAVAIL;
1018         } else {
1019                 *ifindex = 0;
1020         }
1021
1022         return 0;
1023 }
1024
1025 /* Add static entry (via netlink) */
1026 static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
1027                          struct net_device *dev,
1028                          const unsigned char *addr, u16 vid, u16 flags)
1029 {
1030         struct vxlan_dev *vxlan = netdev_priv(dev);
1031         /* struct net *net = dev_net(vxlan->dev); */
1032         union vxlan_addr ip;
1033         __be16 port;
1034         __be32 src_vni, vni;
1035         u32 ifindex;
1036         int err;
1037
1038         if (!(ndm->ndm_state & (NUD_PERMANENT|NUD_REACHABLE))) {
1039                 pr_info("RTM_NEWNEIGH with invalid state %#x\n",
1040                         ndm->ndm_state);
1041                 return -EINVAL;
1042         }
1043
1044         if (tb[NDA_DST] == NULL)
1045                 return -EINVAL;
1046
1047         err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex);
1048         if (err)
1049                 return err;
1050
1051         if (vxlan->default_dst.remote_ip.sa.sa_family != ip.sa.sa_family)
1052                 return -EAFNOSUPPORT;
1053
1054         spin_lock_bh(&vxlan->hash_lock);
1055         err = vxlan_fdb_update(vxlan, addr, &ip, ndm->ndm_state, flags,
1056                                port, src_vni, vni, ifindex,
1057                                ndm->ndm_flags | NTF_VXLAN_ADDED_BY_USER,
1058                                true);
1059         spin_unlock_bh(&vxlan->hash_lock);
1060
1061         return err;
1062 }
1063
1064 static int __vxlan_fdb_delete(struct vxlan_dev *vxlan,
1065                               const unsigned char *addr, union vxlan_addr ip,
1066                               __be16 port, __be32 src_vni, __be32 vni,
1067                               u32 ifindex, bool swdev_notify)
1068 {
1069         struct vxlan_fdb *f;
1070         struct vxlan_rdst *rd = NULL;
1071         int err = -ENOENT;
1072
1073         f = vxlan_find_mac(vxlan, addr, src_vni);
1074         if (!f)
1075                 return err;
1076
1077         if (!vxlan_addr_any(&ip)) {
1078                 rd = vxlan_fdb_find_rdst(f, &ip, port, vni, ifindex);
1079                 if (!rd)
1080                         goto out;
1081         }
1082
1083         /* remove a destination if it's not the only one on the list,
1084          * otherwise destroy the fdb entry
1085          */
1086         if (rd && !list_is_singular(&f->remotes)) {
1087                 vxlan_fdb_dst_destroy(vxlan, f, rd, swdev_notify);
1088                 goto out;
1089         }
1090
1091         vxlan_fdb_destroy(vxlan, f, true, swdev_notify);
1092
1093 out:
1094         return 0;
1095 }
1096
1097 /* Delete entry (via netlink) */
1098 static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
1099                             struct net_device *dev,
1100                             const unsigned char *addr, u16 vid)
1101 {
1102         struct vxlan_dev *vxlan = netdev_priv(dev);
1103         union vxlan_addr ip;
1104         __be32 src_vni, vni;
1105         __be16 port;
1106         u32 ifindex;
1107         int err;
1108
1109         err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex);
1110         if (err)
1111                 return err;
1112
1113         spin_lock_bh(&vxlan->hash_lock);
1114         err = __vxlan_fdb_delete(vxlan, addr, ip, port, src_vni, vni, ifindex,
1115                                  true);
1116         spin_unlock_bh(&vxlan->hash_lock);
1117
1118         return err;
1119 }
1120
1121 /* Dump forwarding table */
1122 static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
1123                           struct net_device *dev,
1124                           struct net_device *filter_dev, int *idx)
1125 {
1126         struct vxlan_dev *vxlan = netdev_priv(dev);
1127         unsigned int h;
1128         int err = 0;
1129
1130         for (h = 0; h < FDB_HASH_SIZE; ++h) {
1131                 struct vxlan_fdb *f;
1132
1133                 hlist_for_each_entry_rcu(f, &vxlan->fdb_head[h], hlist) {
1134                         struct vxlan_rdst *rd;
1135
1136                         list_for_each_entry_rcu(rd, &f->remotes, list) {
1137                                 if (*idx < cb->args[2])
1138                                         goto skip;
1139
1140                                 err = vxlan_fdb_info(skb, vxlan, f,
1141                                                      NETLINK_CB(cb->skb).portid,
1142                                                      cb->nlh->nlmsg_seq,
1143                                                      RTM_NEWNEIGH,
1144                                                      NLM_F_MULTI, rd);
1145                                 if (err < 0)
1146                                         goto out;
1147 skip:
1148                                 *idx += 1;
1149                         }
1150                 }
1151         }
1152 out:
1153         return err;
1154 }
1155
1156 static int vxlan_fdb_get(struct sk_buff *skb,
1157                          struct nlattr *tb[],
1158                          struct net_device *dev,
1159                          const unsigned char *addr,
1160                          u16 vid, u32 portid, u32 seq,
1161                          struct netlink_ext_ack *extack)
1162 {
1163         struct vxlan_dev *vxlan = netdev_priv(dev);
1164         struct vxlan_fdb *f;
1165         __be32 vni;
1166         int err;
1167
1168         if (tb[NDA_VNI])
1169                 vni = cpu_to_be32(nla_get_u32(tb[NDA_VNI]));
1170         else
1171                 vni = vxlan->default_dst.remote_vni;
1172
1173         rcu_read_lock();
1174
1175         f = __vxlan_find_mac(vxlan, addr, vni);
1176         if (!f) {
1177                 NL_SET_ERR_MSG(extack, "Fdb entry not found");
1178                 err = -ENOENT;
1179                 goto errout;
1180         }
1181
1182         err = vxlan_fdb_info(skb, vxlan, f, portid, seq,
1183                              RTM_NEWNEIGH, 0, first_remote_rcu(f));
1184 errout:
1185         rcu_read_unlock();
1186         return err;
1187 }
1188
1189 /* Watch incoming packets to learn mapping between Ethernet address
1190  * and Tunnel endpoint.
1191  * Return true if packet is bogus and should be dropped.
1192  */
1193 static bool vxlan_snoop(struct net_device *dev,
1194                         union vxlan_addr *src_ip, const u8 *src_mac,
1195                         u32 src_ifindex, __be32 vni)
1196 {
1197         struct vxlan_dev *vxlan = netdev_priv(dev);
1198         struct vxlan_fdb *f;
1199         u32 ifindex = 0;
1200
1201 #if IS_ENABLED(CONFIG_IPV6)
1202         if (src_ip->sa.sa_family == AF_INET6 &&
1203             (ipv6_addr_type(&src_ip->sin6.sin6_addr) & IPV6_ADDR_LINKLOCAL))
1204                 ifindex = src_ifindex;
1205 #endif
1206
1207         f = vxlan_find_mac(vxlan, src_mac, vni);
1208         if (likely(f)) {
1209                 struct vxlan_rdst *rdst = first_remote_rcu(f);
1210
1211                 if (likely(vxlan_addr_equal(&rdst->remote_ip, src_ip) &&
1212                            rdst->remote_ifindex == ifindex))
1213                         return false;
1214
1215                 /* Don't migrate static entries, drop packets */
1216                 if (f->state & (NUD_PERMANENT | NUD_NOARP))
1217                         return true;
1218
1219                 if (net_ratelimit())
1220                         netdev_info(dev,
1221                                     "%pM migrated from %pIS to %pIS\n",
1222                                     src_mac, &rdst->remote_ip.sa, &src_ip->sa);
1223
1224                 rdst->remote_ip = *src_ip;
1225                 f->updated = jiffies;
1226                 vxlan_fdb_notify(vxlan, f, rdst, RTM_NEWNEIGH, true);
1227         } else {
1228                 /* learned new entry */
1229                 spin_lock(&vxlan->hash_lock);
1230
1231                 /* close off race between vxlan_flush and incoming packets */
1232                 if (netif_running(dev))
1233                         vxlan_fdb_update(vxlan, src_mac, src_ip,
1234                                          NUD_REACHABLE,
1235                                          NLM_F_EXCL|NLM_F_CREATE,
1236                                          vxlan->cfg.dst_port,
1237                                          vni,
1238                                          vxlan->default_dst.remote_vni,
1239                                          ifindex, NTF_SELF, true);
1240                 spin_unlock(&vxlan->hash_lock);
1241         }
1242
1243         return false;
1244 }
1245
1246 /* See if multicast group is already in use by other ID */
1247 static bool vxlan_group_used(struct vxlan_net *vn, struct vxlan_dev *dev)
1248 {
1249         struct vxlan_dev *vxlan;
1250         struct vxlan_sock *sock4;
1251 #if IS_ENABLED(CONFIG_IPV6)
1252         struct vxlan_sock *sock6;
1253 #endif
1254         unsigned short family = dev->default_dst.remote_ip.sa.sa_family;
1255
1256         sock4 = rtnl_dereference(dev->vn4_sock);
1257
1258         /* The vxlan_sock is only used by dev, leaving group has
1259          * no effect on other vxlan devices.
1260          */
1261         if (family == AF_INET && sock4 && refcount_read(&sock4->refcnt) == 1)
1262                 return false;
1263 #if IS_ENABLED(CONFIG_IPV6)
1264         sock6 = rtnl_dereference(dev->vn6_sock);
1265         if (family == AF_INET6 && sock6 && refcount_read(&sock6->refcnt) == 1)
1266                 return false;
1267 #endif
1268
1269         list_for_each_entry(vxlan, &vn->vxlan_list, next) {
1270                 if (!netif_running(vxlan->dev) || vxlan == dev)
1271                         continue;
1272
1273                 if (family == AF_INET &&
1274                     rtnl_dereference(vxlan->vn4_sock) != sock4)
1275                         continue;
1276 #if IS_ENABLED(CONFIG_IPV6)
1277                 if (family == AF_INET6 &&
1278                     rtnl_dereference(vxlan->vn6_sock) != sock6)
1279                         continue;
1280 #endif
1281
1282                 if (!vxlan_addr_equal(&vxlan->default_dst.remote_ip,
1283                                       &dev->default_dst.remote_ip))
1284                         continue;
1285
1286                 if (vxlan->default_dst.remote_ifindex !=
1287                     dev->default_dst.remote_ifindex)
1288                         continue;
1289
1290                 return true;
1291         }
1292
1293         return false;
1294 }
1295
1296 static bool __vxlan_sock_release_prep(struct vxlan_sock *vs)
1297 {
1298         struct vxlan_net *vn;
1299
1300         if (!vs)
1301                 return false;
1302         if (!refcount_dec_and_test(&vs->refcnt))
1303                 return false;
1304
1305         vn = net_generic(sock_net(vs->sock->sk), vxlan_net_id);
1306         spin_lock(&vn->sock_lock);
1307         hlist_del_rcu(&vs->hlist);
1308         udp_tunnel_notify_del_rx_port(vs->sock,
1309                                       (vs->flags & VXLAN_F_GPE) ?
1310                                       UDP_TUNNEL_TYPE_VXLAN_GPE :
1311                                       UDP_TUNNEL_TYPE_VXLAN);
1312         spin_unlock(&vn->sock_lock);
1313
1314         return true;
1315 }
1316
1317 static void vxlan_sock_release(struct vxlan_dev *vxlan)
1318 {
1319         struct vxlan_sock *sock4 = rtnl_dereference(vxlan->vn4_sock);
1320 #if IS_ENABLED(CONFIG_IPV6)
1321         struct vxlan_sock *sock6 = rtnl_dereference(vxlan->vn6_sock);
1322
1323         RCU_INIT_POINTER(vxlan->vn6_sock, NULL);
1324 #endif
1325
1326         RCU_INIT_POINTER(vxlan->vn4_sock, NULL);
1327         synchronize_net();
1328
1329         vxlan_vs_del_dev(vxlan);
1330
1331         if (__vxlan_sock_release_prep(sock4)) {
1332                 udp_tunnel_sock_release(sock4->sock);
1333                 kfree(sock4);
1334         }
1335
1336 #if IS_ENABLED(CONFIG_IPV6)
1337         if (__vxlan_sock_release_prep(sock6)) {
1338                 udp_tunnel_sock_release(sock6->sock);
1339                 kfree(sock6);
1340         }
1341 #endif
1342 }
1343
1344 /* Update multicast group membership when first VNI on
1345  * multicast address is brought up
1346  */
1347 static int vxlan_igmp_join(struct vxlan_dev *vxlan)
1348 {
1349         struct sock *sk;
1350         union vxlan_addr *ip = &vxlan->default_dst.remote_ip;
1351         int ifindex = vxlan->default_dst.remote_ifindex;
1352         int ret = -EINVAL;
1353
1354         if (ip->sa.sa_family == AF_INET) {
1355                 struct vxlan_sock *sock4 = rtnl_dereference(vxlan->vn4_sock);
1356                 struct ip_mreqn mreq = {
1357                         .imr_multiaddr.s_addr   = ip->sin.sin_addr.s_addr,
1358                         .imr_ifindex            = ifindex,
1359                 };
1360
1361                 sk = sock4->sock->sk;
1362                 lock_sock(sk);
1363                 ret = ip_mc_join_group(sk, &mreq);
1364                 release_sock(sk);
1365 #if IS_ENABLED(CONFIG_IPV6)
1366         } else {
1367                 struct vxlan_sock *sock6 = rtnl_dereference(vxlan->vn6_sock);
1368
1369                 sk = sock6->sock->sk;
1370                 lock_sock(sk);
1371                 ret = ipv6_stub->ipv6_sock_mc_join(sk, ifindex,
1372                                                    &ip->sin6.sin6_addr);
1373                 release_sock(sk);
1374 #endif
1375         }
1376
1377         return ret;
1378 }
1379
1380 /* Inverse of vxlan_igmp_join when last VNI is brought down */
1381 static int vxlan_igmp_leave(struct vxlan_dev *vxlan)
1382 {
1383         struct sock *sk;
1384         union vxlan_addr *ip = &vxlan->default_dst.remote_ip;
1385         int ifindex = vxlan->default_dst.remote_ifindex;
1386         int ret = -EINVAL;
1387
1388         if (ip->sa.sa_family == AF_INET) {
1389                 struct vxlan_sock *sock4 = rtnl_dereference(vxlan->vn4_sock);
1390                 struct ip_mreqn mreq = {
1391                         .imr_multiaddr.s_addr   = ip->sin.sin_addr.s_addr,
1392                         .imr_ifindex            = ifindex,
1393                 };
1394
1395                 sk = sock4->sock->sk;
1396                 lock_sock(sk);
1397                 ret = ip_mc_leave_group(sk, &mreq);
1398                 release_sock(sk);
1399 #if IS_ENABLED(CONFIG_IPV6)
1400         } else {
1401                 struct vxlan_sock *sock6 = rtnl_dereference(vxlan->vn6_sock);
1402
1403                 sk = sock6->sock->sk;
1404                 lock_sock(sk);
1405                 ret = ipv6_stub->ipv6_sock_mc_drop(sk, ifindex,
1406                                                    &ip->sin6.sin6_addr);
1407                 release_sock(sk);
1408 #endif
1409         }
1410
1411         return ret;
1412 }
1413
1414 static bool vxlan_remcsum(struct vxlanhdr *unparsed,
1415                           struct sk_buff *skb, u32 vxflags)
1416 {
1417         size_t start, offset;
1418
1419         if (!(unparsed->vx_flags & VXLAN_HF_RCO) || skb->remcsum_offload)
1420                 goto out;
1421
1422         start = vxlan_rco_start(unparsed->vx_vni);
1423         offset = start + vxlan_rco_offset(unparsed->vx_vni);
1424
1425         if (!pskb_may_pull(skb, offset + sizeof(u16)))
1426                 return false;
1427
1428         skb_remcsum_process(skb, (void *)(vxlan_hdr(skb) + 1), start, offset,
1429                             !!(vxflags & VXLAN_F_REMCSUM_NOPARTIAL));
1430 out:
1431         unparsed->vx_flags &= ~VXLAN_HF_RCO;
1432         unparsed->vx_vni &= VXLAN_VNI_MASK;
1433         return true;
1434 }
1435
1436 static void vxlan_parse_gbp_hdr(struct vxlanhdr *unparsed,
1437                                 struct sk_buff *skb, u32 vxflags,
1438                                 struct vxlan_metadata *md)
1439 {
1440         struct vxlanhdr_gbp *gbp = (struct vxlanhdr_gbp *)unparsed;
1441         struct metadata_dst *tun_dst;
1442
1443         if (!(unparsed->vx_flags & VXLAN_HF_GBP))
1444                 goto out;
1445
1446         md->gbp = ntohs(gbp->policy_id);
1447
1448         tun_dst = (struct metadata_dst *)skb_dst(skb);
1449         if (tun_dst) {
1450                 tun_dst->u.tun_info.key.tun_flags |= TUNNEL_VXLAN_OPT;
1451                 tun_dst->u.tun_info.options_len = sizeof(*md);
1452         }
1453         if (gbp->dont_learn)
1454                 md->gbp |= VXLAN_GBP_DONT_LEARN;
1455
1456         if (gbp->policy_applied)
1457                 md->gbp |= VXLAN_GBP_POLICY_APPLIED;
1458
1459         /* In flow-based mode, GBP is carried in dst_metadata */
1460         if (!(vxflags & VXLAN_F_COLLECT_METADATA))
1461                 skb->mark = md->gbp;
1462 out:
1463         unparsed->vx_flags &= ~VXLAN_GBP_USED_BITS;
1464 }
1465
1466 static bool vxlan_parse_gpe_hdr(struct vxlanhdr *unparsed,
1467                                 __be16 *protocol,
1468                                 struct sk_buff *skb, u32 vxflags)
1469 {
1470         struct vxlanhdr_gpe *gpe = (struct vxlanhdr_gpe *)unparsed;
1471
1472         /* Need to have Next Protocol set for interfaces in GPE mode. */
1473         if (!gpe->np_applied)
1474                 return false;
1475         /* "The initial version is 0. If a receiver does not support the
1476          * version indicated it MUST drop the packet.
1477          */
1478         if (gpe->version != 0)
1479                 return false;
1480         /* "When the O bit is set to 1, the packet is an OAM packet and OAM
1481          * processing MUST occur." However, we don't implement OAM
1482          * processing, thus drop the packet.
1483          */
1484         if (gpe->oam_flag)
1485                 return false;
1486
1487         *protocol = tun_p_to_eth_p(gpe->next_protocol);
1488         if (!*protocol)
1489                 return false;
1490
1491         unparsed->vx_flags &= ~VXLAN_GPE_USED_BITS;
1492         return true;
1493 }
1494
1495 static bool vxlan_set_mac(struct vxlan_dev *vxlan,
1496                           struct vxlan_sock *vs,
1497                           struct sk_buff *skb, __be32 vni)
1498 {
1499         union vxlan_addr saddr;
1500         u32 ifindex = skb->dev->ifindex;
1501
1502         skb_reset_mac_header(skb);
1503         skb->protocol = eth_type_trans(skb, vxlan->dev);
1504         skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1505
1506         /* Ignore packet loops (and multicast echo) */
1507         if (ether_addr_equal(eth_hdr(skb)->h_source, vxlan->dev->dev_addr))
1508                 return false;
1509
1510         /* Get address from the outer IP header */
1511         if (vxlan_get_sk_family(vs) == AF_INET) {
1512                 saddr.sin.sin_addr.s_addr = ip_hdr(skb)->saddr;
1513                 saddr.sa.sa_family = AF_INET;
1514 #if IS_ENABLED(CONFIG_IPV6)
1515         } else {
1516                 saddr.sin6.sin6_addr = ipv6_hdr(skb)->saddr;
1517                 saddr.sa.sa_family = AF_INET6;
1518 #endif
1519         }
1520
1521         if ((vxlan->cfg.flags & VXLAN_F_LEARN) &&
1522             vxlan_snoop(skb->dev, &saddr, eth_hdr(skb)->h_source, ifindex, vni))
1523                 return false;
1524
1525         return true;
1526 }
1527
1528 static bool vxlan_ecn_decapsulate(struct vxlan_sock *vs, void *oiph,
1529                                   struct sk_buff *skb)
1530 {
1531         int err = 0;
1532
1533         if (vxlan_get_sk_family(vs) == AF_INET)
1534                 err = IP_ECN_decapsulate(oiph, skb);
1535 #if IS_ENABLED(CONFIG_IPV6)
1536         else
1537                 err = IP6_ECN_decapsulate(oiph, skb);
1538 #endif
1539
1540         if (unlikely(err) && log_ecn_error) {
1541                 if (vxlan_get_sk_family(vs) == AF_INET)
1542                         net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
1543                                              &((struct iphdr *)oiph)->saddr,
1544                                              ((struct iphdr *)oiph)->tos);
1545                 else
1546                         net_info_ratelimited("non-ECT from %pI6\n",
1547                                              &((struct ipv6hdr *)oiph)->saddr);
1548         }
1549         return err <= 1;
1550 }
1551
1552 /* Callback from net/ipv4/udp.c to receive packets */
1553 static int vxlan_rcv(struct sock *sk, struct sk_buff *skb)
1554 {
1555         struct pcpu_sw_netstats *stats;
1556         struct vxlan_dev *vxlan;
1557         struct vxlan_sock *vs;
1558         struct vxlanhdr unparsed;
1559         struct vxlan_metadata _md;
1560         struct vxlan_metadata *md = &_md;
1561         __be16 protocol = htons(ETH_P_TEB);
1562         bool raw_proto = false;
1563         void *oiph;
1564         __be32 vni = 0;
1565
1566         /* Need UDP and VXLAN header to be present */
1567         if (!pskb_may_pull(skb, VXLAN_HLEN))
1568                 goto drop;
1569
1570         unparsed = *vxlan_hdr(skb);
1571         /* VNI flag always required to be set */
1572         if (!(unparsed.vx_flags & VXLAN_HF_VNI)) {
1573                 netdev_dbg(skb->dev, "invalid vxlan flags=%#x vni=%#x\n",
1574                            ntohl(vxlan_hdr(skb)->vx_flags),
1575                            ntohl(vxlan_hdr(skb)->vx_vni));
1576                 /* Return non vxlan pkt */
1577                 goto drop;
1578         }
1579         unparsed.vx_flags &= ~VXLAN_HF_VNI;
1580         unparsed.vx_vni &= ~VXLAN_VNI_MASK;
1581
1582         vs = rcu_dereference_sk_user_data(sk);
1583         if (!vs)
1584                 goto drop;
1585
1586         vni = vxlan_vni(vxlan_hdr(skb)->vx_vni);
1587
1588         vxlan = vxlan_vs_find_vni(vs, skb->dev->ifindex, vni);
1589         if (!vxlan)
1590                 goto drop;
1591
1592         /* For backwards compatibility, only allow reserved fields to be
1593          * used by VXLAN extensions if explicitly requested.
1594          */
1595         if (vs->flags & VXLAN_F_GPE) {
1596                 if (!vxlan_parse_gpe_hdr(&unparsed, &protocol, skb, vs->flags))
1597                         goto drop;
1598                 raw_proto = true;
1599         }
1600
1601         if (__iptunnel_pull_header(skb, VXLAN_HLEN, protocol, raw_proto,
1602                                    !net_eq(vxlan->net, dev_net(vxlan->dev))))
1603                         goto drop;
1604
1605         if (vxlan_collect_metadata(vs)) {
1606                 struct metadata_dst *tun_dst;
1607
1608                 tun_dst = udp_tun_rx_dst(skb, vxlan_get_sk_family(vs), TUNNEL_KEY,
1609                                          key32_to_tunnel_id(vni), sizeof(*md));
1610
1611                 if (!tun_dst)
1612                         goto drop;
1613
1614                 md = ip_tunnel_info_opts(&tun_dst->u.tun_info);
1615
1616                 skb_dst_set(skb, (struct dst_entry *)tun_dst);
1617         } else {
1618                 memset(md, 0, sizeof(*md));
1619         }
1620
1621         if (vs->flags & VXLAN_F_REMCSUM_RX)
1622                 if (!vxlan_remcsum(&unparsed, skb, vs->flags))
1623                         goto drop;
1624         if (vs->flags & VXLAN_F_GBP)
1625                 vxlan_parse_gbp_hdr(&unparsed, skb, vs->flags, md);
1626         /* Note that GBP and GPE can never be active together. This is
1627          * ensured in vxlan_dev_configure.
1628          */
1629
1630         if (unparsed.vx_flags || unparsed.vx_vni) {
1631                 /* If there are any unprocessed flags remaining treat
1632                  * this as a malformed packet. This behavior diverges from
1633                  * VXLAN RFC (RFC7348) which stipulates that bits in reserved
1634                  * in reserved fields are to be ignored. The approach here
1635                  * maintains compatibility with previous stack code, and also
1636                  * is more robust and provides a little more security in
1637                  * adding extensions to VXLAN.
1638                  */
1639                 goto drop;
1640         }
1641
1642         if (!raw_proto) {
1643                 if (!vxlan_set_mac(vxlan, vs, skb, vni))
1644                         goto drop;
1645         } else {
1646                 skb_reset_mac_header(skb);
1647                 skb->dev = vxlan->dev;
1648                 skb->pkt_type = PACKET_HOST;
1649         }
1650
1651         oiph = skb_network_header(skb);
1652         skb_reset_network_header(skb);
1653
1654         if (!vxlan_ecn_decapsulate(vs, oiph, skb)) {
1655                 ++vxlan->dev->stats.rx_frame_errors;
1656                 ++vxlan->dev->stats.rx_errors;
1657                 goto drop;
1658         }
1659
1660         stats = this_cpu_ptr(vxlan->dev->tstats);
1661         u64_stats_update_begin(&stats->syncp);
1662         stats->rx_packets++;
1663         stats->rx_bytes += skb->len;
1664         u64_stats_update_end(&stats->syncp);
1665
1666         gro_cells_receive(&vxlan->gro_cells, skb);
1667         return 0;
1668
1669 drop:
1670         /* Consume bad packet */
1671         kfree_skb(skb);
1672         return 0;
1673 }
1674
1675 /* Callback from net/ipv{4,6}/udp.c to check that we have a VNI for errors */
1676 static int vxlan_err_lookup(struct sock *sk, struct sk_buff *skb)
1677 {
1678         struct vxlan_dev *vxlan;
1679         struct vxlan_sock *vs;
1680         struct vxlanhdr *hdr;
1681         __be32 vni;
1682
1683         if (skb->len < VXLAN_HLEN)
1684                 return -EINVAL;
1685
1686         hdr = vxlan_hdr(skb);
1687
1688         if (!(hdr->vx_flags & VXLAN_HF_VNI))
1689                 return -EINVAL;
1690
1691         vs = rcu_dereference_sk_user_data(sk);
1692         if (!vs)
1693                 return -ENOENT;
1694
1695         vni = vxlan_vni(hdr->vx_vni);
1696         vxlan = vxlan_vs_find_vni(vs, skb->dev->ifindex, vni);
1697         if (!vxlan)
1698                 return -ENOENT;
1699
1700         return 0;
1701 }
1702
1703 static int arp_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni)
1704 {
1705         struct vxlan_dev *vxlan = netdev_priv(dev);
1706         struct arphdr *parp;
1707         u8 *arpptr, *sha;
1708         __be32 sip, tip;
1709         struct neighbour *n;
1710
1711         if (dev->flags & IFF_NOARP)
1712                 goto out;
1713
1714         if (!pskb_may_pull(skb, arp_hdr_len(dev))) {
1715                 dev->stats.tx_dropped++;
1716                 goto out;
1717         }
1718         parp = arp_hdr(skb);
1719
1720         if ((parp->ar_hrd != htons(ARPHRD_ETHER) &&
1721              parp->ar_hrd != htons(ARPHRD_IEEE802)) ||
1722             parp->ar_pro != htons(ETH_P_IP) ||
1723             parp->ar_op != htons(ARPOP_REQUEST) ||
1724             parp->ar_hln != dev->addr_len ||
1725             parp->ar_pln != 4)
1726                 goto out;
1727         arpptr = (u8 *)parp + sizeof(struct arphdr);
1728         sha = arpptr;
1729         arpptr += dev->addr_len;        /* sha */
1730         memcpy(&sip, arpptr, sizeof(sip));
1731         arpptr += sizeof(sip);
1732         arpptr += dev->addr_len;        /* tha */
1733         memcpy(&tip, arpptr, sizeof(tip));
1734
1735         if (ipv4_is_loopback(tip) ||
1736             ipv4_is_multicast(tip))
1737                 goto out;
1738
1739         n = neigh_lookup(&arp_tbl, &tip, dev);
1740
1741         if (n) {
1742                 struct vxlan_fdb *f;
1743                 struct sk_buff  *reply;
1744
1745                 if (!(n->nud_state & NUD_CONNECTED)) {
1746                         neigh_release(n);
1747                         goto out;
1748                 }
1749
1750                 f = vxlan_find_mac(vxlan, n->ha, vni);
1751                 if (f && vxlan_addr_any(&(first_remote_rcu(f)->remote_ip))) {
1752                         /* bridge-local neighbor */
1753                         neigh_release(n);
1754                         goto out;
1755                 }
1756
1757                 reply = arp_create(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha,
1758                                 n->ha, sha);
1759
1760                 neigh_release(n);
1761
1762                 if (reply == NULL)
1763                         goto out;
1764
1765                 skb_reset_mac_header(reply);
1766                 __skb_pull(reply, skb_network_offset(reply));
1767                 reply->ip_summed = CHECKSUM_UNNECESSARY;
1768                 reply->pkt_type = PACKET_HOST;
1769
1770                 if (netif_rx_ni(reply) == NET_RX_DROP)
1771                         dev->stats.rx_dropped++;
1772         } else if (vxlan->cfg.flags & VXLAN_F_L3MISS) {
1773                 union vxlan_addr ipa = {
1774                         .sin.sin_addr.s_addr = tip,
1775                         .sin.sin_family = AF_INET,
1776                 };
1777
1778                 vxlan_ip_miss(dev, &ipa);
1779         }
1780 out:
1781         consume_skb(skb);
1782         return NETDEV_TX_OK;
1783 }
1784
1785 #if IS_ENABLED(CONFIG_IPV6)
1786 static struct sk_buff *vxlan_na_create(struct sk_buff *request,
1787         struct neighbour *n, bool isrouter)
1788 {
1789         struct net_device *dev = request->dev;
1790         struct sk_buff *reply;
1791         struct nd_msg *ns, *na;
1792         struct ipv6hdr *pip6;
1793         u8 *daddr;
1794         int na_olen = 8; /* opt hdr + ETH_ALEN for target */
1795         int ns_olen;
1796         int i, len;
1797
1798         if (dev == NULL || !pskb_may_pull(request, request->len))
1799                 return NULL;
1800
1801         len = LL_RESERVED_SPACE(dev) + sizeof(struct ipv6hdr) +
1802                 sizeof(*na) + na_olen + dev->needed_tailroom;
1803         reply = alloc_skb(len, GFP_ATOMIC);
1804         if (reply == NULL)
1805                 return NULL;
1806
1807         reply->protocol = htons(ETH_P_IPV6);
1808         reply->dev = dev;
1809         skb_reserve(reply, LL_RESERVED_SPACE(request->dev));
1810         skb_push(reply, sizeof(struct ethhdr));
1811         skb_reset_mac_header(reply);
1812
1813         ns = (struct nd_msg *)(ipv6_hdr(request) + 1);
1814
1815         daddr = eth_hdr(request)->h_source;
1816         ns_olen = request->len - skb_network_offset(request) -
1817                 sizeof(struct ipv6hdr) - sizeof(*ns);
1818         for (i = 0; i < ns_olen-1; i += (ns->opt[i+1]<<3)) {
1819                 if (ns->opt[i] == ND_OPT_SOURCE_LL_ADDR) {
1820                         daddr = ns->opt + i + sizeof(struct nd_opt_hdr);
1821                         break;
1822                 }
1823         }
1824
1825         /* Ethernet header */
1826         ether_addr_copy(eth_hdr(reply)->h_dest, daddr);
1827         ether_addr_copy(eth_hdr(reply)->h_source, n->ha);
1828         eth_hdr(reply)->h_proto = htons(ETH_P_IPV6);
1829         reply->protocol = htons(ETH_P_IPV6);
1830
1831         skb_pull(reply, sizeof(struct ethhdr));
1832         skb_reset_network_header(reply);
1833         skb_put(reply, sizeof(struct ipv6hdr));
1834
1835         /* IPv6 header */
1836
1837         pip6 = ipv6_hdr(reply);
1838         memset(pip6, 0, sizeof(struct ipv6hdr));
1839         pip6->version = 6;
1840         pip6->priority = ipv6_hdr(request)->priority;
1841         pip6->nexthdr = IPPROTO_ICMPV6;
1842         pip6->hop_limit = 255;
1843         pip6->daddr = ipv6_hdr(request)->saddr;
1844         pip6->saddr = *(struct in6_addr *)n->primary_key;
1845
1846         skb_pull(reply, sizeof(struct ipv6hdr));
1847         skb_reset_transport_header(reply);
1848
1849         /* Neighbor Advertisement */
1850         na = skb_put_zero(reply, sizeof(*na) + na_olen);
1851         na->icmph.icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT;
1852         na->icmph.icmp6_router = isrouter;
1853         na->icmph.icmp6_override = 1;
1854         na->icmph.icmp6_solicited = 1;
1855         na->target = ns->target;
1856         ether_addr_copy(&na->opt[2], n->ha);
1857         na->opt[0] = ND_OPT_TARGET_LL_ADDR;
1858         na->opt[1] = na_olen >> 3;
1859
1860         na->icmph.icmp6_cksum = csum_ipv6_magic(&pip6->saddr,
1861                 &pip6->daddr, sizeof(*na)+na_olen, IPPROTO_ICMPV6,
1862                 csum_partial(na, sizeof(*na)+na_olen, 0));
1863
1864         pip6->payload_len = htons(sizeof(*na)+na_olen);
1865
1866         skb_push(reply, sizeof(struct ipv6hdr));
1867
1868         reply->ip_summed = CHECKSUM_UNNECESSARY;
1869
1870         return reply;
1871 }
1872
1873 static int neigh_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni)
1874 {
1875         struct vxlan_dev *vxlan = netdev_priv(dev);
1876         const struct in6_addr *daddr;
1877         const struct ipv6hdr *iphdr;
1878         struct inet6_dev *in6_dev;
1879         struct neighbour *n;
1880         struct nd_msg *msg;
1881
1882         in6_dev = __in6_dev_get(dev);
1883         if (!in6_dev)
1884                 goto out;
1885
1886         iphdr = ipv6_hdr(skb);
1887         daddr = &iphdr->daddr;
1888         msg = (struct nd_msg *)(iphdr + 1);
1889
1890         if (ipv6_addr_loopback(daddr) ||
1891             ipv6_addr_is_multicast(&msg->target))
1892                 goto out;
1893
1894         n = neigh_lookup(ipv6_stub->nd_tbl, &msg->target, dev);
1895
1896         if (n) {
1897                 struct vxlan_fdb *f;
1898                 struct sk_buff *reply;
1899
1900                 if (!(n->nud_state & NUD_CONNECTED)) {
1901                         neigh_release(n);
1902                         goto out;
1903                 }
1904
1905                 f = vxlan_find_mac(vxlan, n->ha, vni);
1906                 if (f && vxlan_addr_any(&(first_remote_rcu(f)->remote_ip))) {
1907                         /* bridge-local neighbor */
1908                         neigh_release(n);
1909                         goto out;
1910                 }
1911
1912                 reply = vxlan_na_create(skb, n,
1913                                         !!(f ? f->flags & NTF_ROUTER : 0));
1914
1915                 neigh_release(n);
1916
1917                 if (reply == NULL)
1918                         goto out;
1919
1920                 if (netif_rx_ni(reply) == NET_RX_DROP)
1921                         dev->stats.rx_dropped++;
1922
1923         } else if (vxlan->cfg.flags & VXLAN_F_L3MISS) {
1924                 union vxlan_addr ipa = {
1925                         .sin6.sin6_addr = msg->target,
1926                         .sin6.sin6_family = AF_INET6,
1927                 };
1928
1929                 vxlan_ip_miss(dev, &ipa);
1930         }
1931
1932 out:
1933         consume_skb(skb);
1934         return NETDEV_TX_OK;
1935 }
1936 #endif
1937
1938 static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb)
1939 {
1940         struct vxlan_dev *vxlan = netdev_priv(dev);
1941         struct neighbour *n;
1942
1943         if (is_multicast_ether_addr(eth_hdr(skb)->h_dest))
1944                 return false;
1945
1946         n = NULL;
1947         switch (ntohs(eth_hdr(skb)->h_proto)) {
1948         case ETH_P_IP:
1949         {
1950                 struct iphdr *pip;
1951
1952                 if (!pskb_may_pull(skb, sizeof(struct iphdr)))
1953                         return false;
1954                 pip = ip_hdr(skb);
1955                 n = neigh_lookup(&arp_tbl, &pip->daddr, dev);
1956                 if (!n && (vxlan->cfg.flags & VXLAN_F_L3MISS)) {
1957                         union vxlan_addr ipa = {
1958                                 .sin.sin_addr.s_addr = pip->daddr,
1959                                 .sin.sin_family = AF_INET,
1960                         };
1961
1962                         vxlan_ip_miss(dev, &ipa);
1963                         return false;
1964                 }
1965
1966                 break;
1967         }
1968 #if IS_ENABLED(CONFIG_IPV6)
1969         case ETH_P_IPV6:
1970         {
1971                 struct ipv6hdr *pip6;
1972
1973                 if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
1974                         return false;
1975                 pip6 = ipv6_hdr(skb);
1976                 n = neigh_lookup(ipv6_stub->nd_tbl, &pip6->daddr, dev);
1977                 if (!n && (vxlan->cfg.flags & VXLAN_F_L3MISS)) {
1978                         union vxlan_addr ipa = {
1979                                 .sin6.sin6_addr = pip6->daddr,
1980                                 .sin6.sin6_family = AF_INET6,
1981                         };
1982
1983                         vxlan_ip_miss(dev, &ipa);
1984                         return false;
1985                 }
1986
1987                 break;
1988         }
1989 #endif
1990         default:
1991                 return false;
1992         }
1993
1994         if (n) {
1995                 bool diff;
1996
1997                 diff = !ether_addr_equal(eth_hdr(skb)->h_dest, n->ha);
1998                 if (diff) {
1999                         memcpy(eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
2000                                 dev->addr_len);
2001                         memcpy(eth_hdr(skb)->h_dest, n->ha, dev->addr_len);
2002                 }
2003                 neigh_release(n);
2004                 return diff;
2005         }
2006
2007         return false;
2008 }
2009
2010 static void vxlan_build_gbp_hdr(struct vxlanhdr *vxh, u32 vxflags,
2011                                 struct vxlan_metadata *md)
2012 {
2013         struct vxlanhdr_gbp *gbp;
2014
2015         if (!md->gbp)
2016                 return;
2017
2018         gbp = (struct vxlanhdr_gbp *)vxh;
2019         vxh->vx_flags |= VXLAN_HF_GBP;
2020
2021         if (md->gbp & VXLAN_GBP_DONT_LEARN)
2022                 gbp->dont_learn = 1;
2023
2024         if (md->gbp & VXLAN_GBP_POLICY_APPLIED)
2025                 gbp->policy_applied = 1;
2026
2027         gbp->policy_id = htons(md->gbp & VXLAN_GBP_ID_MASK);
2028 }
2029
2030 static int vxlan_build_gpe_hdr(struct vxlanhdr *vxh, u32 vxflags,
2031                                __be16 protocol)
2032 {
2033         struct vxlanhdr_gpe *gpe = (struct vxlanhdr_gpe *)vxh;
2034
2035         gpe->np_applied = 1;
2036         gpe->next_protocol = tun_p_from_eth_p(protocol);
2037         if (!gpe->next_protocol)
2038                 return -EPFNOSUPPORT;
2039         return 0;
2040 }
2041
2042 static int vxlan_build_skb(struct sk_buff *skb, struct dst_entry *dst,
2043                            int iphdr_len, __be32 vni,
2044                            struct vxlan_metadata *md, u32 vxflags,
2045                            bool udp_sum)
2046 {
2047         struct vxlanhdr *vxh;
2048         int min_headroom;
2049         int err;
2050         int type = udp_sum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL;
2051         __be16 inner_protocol = htons(ETH_P_TEB);
2052
2053         if ((vxflags & VXLAN_F_REMCSUM_TX) &&
2054             skb->ip_summed == CHECKSUM_PARTIAL) {
2055                 int csum_start = skb_checksum_start_offset(skb);
2056
2057                 if (csum_start <= VXLAN_MAX_REMCSUM_START &&
2058                     !(csum_start & VXLAN_RCO_SHIFT_MASK) &&
2059                     (skb->csum_offset == offsetof(struct udphdr, check) ||
2060                      skb->csum_offset == offsetof(struct tcphdr, check)))
2061                         type |= SKB_GSO_TUNNEL_REMCSUM;
2062         }
2063
2064         min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len
2065                         + VXLAN_HLEN + iphdr_len;
2066
2067         /* Need space for new headers (invalidates iph ptr) */
2068         err = skb_cow_head(skb, min_headroom);
2069         if (unlikely(err))
2070                 return err;
2071
2072         err = iptunnel_handle_offloads(skb, type);
2073         if (err)
2074                 return err;
2075
2076         vxh = __skb_push(skb, sizeof(*vxh));
2077         vxh->vx_flags = VXLAN_HF_VNI;
2078         vxh->vx_vni = vxlan_vni_field(vni);
2079
2080         if (type & SKB_GSO_TUNNEL_REMCSUM) {
2081                 unsigned int start;
2082
2083                 start = skb_checksum_start_offset(skb) - sizeof(struct vxlanhdr);
2084                 vxh->vx_vni |= vxlan_compute_rco(start, skb->csum_offset);
2085                 vxh->vx_flags |= VXLAN_HF_RCO;
2086
2087                 if (!skb_is_gso(skb)) {
2088                         skb->ip_summed = CHECKSUM_NONE;
2089                         skb->encapsulation = 0;
2090                 }
2091         }
2092
2093         if (vxflags & VXLAN_F_GBP)
2094                 vxlan_build_gbp_hdr(vxh, vxflags, md);
2095         if (vxflags & VXLAN_F_GPE) {
2096                 err = vxlan_build_gpe_hdr(vxh, vxflags, skb->protocol);
2097                 if (err < 0)
2098                         return err;
2099                 inner_protocol = skb->protocol;
2100         }
2101
2102         skb_set_inner_protocol(skb, inner_protocol);
2103         return 0;
2104 }
2105
2106 static struct rtable *vxlan_get_route(struct vxlan_dev *vxlan, struct net_device *dev,
2107                                       struct vxlan_sock *sock4,
2108                                       struct sk_buff *skb, int oif, u8 tos,
2109                                       __be32 daddr, __be32 *saddr, __be16 dport, __be16 sport,
2110                                       struct dst_cache *dst_cache,
2111                                       const struct ip_tunnel_info *info)
2112 {
2113         bool use_cache = ip_tunnel_dst_cache_usable(skb, info);
2114         struct rtable *rt = NULL;
2115         struct flowi4 fl4;
2116
2117         if (!sock4)
2118                 return ERR_PTR(-EIO);
2119
2120         if (tos && !info)
2121                 use_cache = false;
2122         if (use_cache) {
2123                 rt = dst_cache_get_ip4(dst_cache, saddr);
2124                 if (rt)
2125                         return rt;
2126         }
2127
2128         memset(&fl4, 0, sizeof(fl4));
2129         fl4.flowi4_oif = oif;
2130         fl4.flowi4_tos = RT_TOS(tos);
2131         fl4.flowi4_mark = skb->mark;
2132         fl4.flowi4_proto = IPPROTO_UDP;
2133         fl4.daddr = daddr;
2134         fl4.saddr = *saddr;
2135         fl4.fl4_dport = dport;
2136         fl4.fl4_sport = sport;
2137
2138         rt = ip_route_output_key(vxlan->net, &fl4);
2139         if (likely(!IS_ERR(rt))) {
2140                 if (rt->dst.dev == dev) {
2141                         netdev_dbg(dev, "circular route to %pI4\n", &daddr);
2142                         ip_rt_put(rt);
2143                         return ERR_PTR(-ELOOP);
2144                 }
2145
2146                 *saddr = fl4.saddr;
2147                 if (use_cache)
2148                         dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr);
2149         } else {
2150                 netdev_dbg(dev, "no route to %pI4\n", &daddr);
2151                 return ERR_PTR(-ENETUNREACH);
2152         }
2153         return rt;
2154 }
2155
2156 #if IS_ENABLED(CONFIG_IPV6)
2157 static struct dst_entry *vxlan6_get_route(struct vxlan_dev *vxlan,
2158                                           struct net_device *dev,
2159                                           struct vxlan_sock *sock6,
2160                                           struct sk_buff *skb, int oif, u8 tos,
2161                                           __be32 label,
2162                                           const struct in6_addr *daddr,
2163                                           struct in6_addr *saddr,
2164                                           __be16 dport, __be16 sport,
2165                                           struct dst_cache *dst_cache,
2166                                           const struct ip_tunnel_info *info)
2167 {
2168         bool use_cache = ip_tunnel_dst_cache_usable(skb, info);
2169         struct dst_entry *ndst;
2170         struct flowi6 fl6;
2171         int err;
2172
2173         if (!sock6)
2174                 return ERR_PTR(-EIO);
2175
2176         if (tos && !info)
2177                 use_cache = false;
2178         if (use_cache) {
2179                 ndst = dst_cache_get_ip6(dst_cache, saddr);
2180                 if (ndst)
2181                         return ndst;
2182         }
2183
2184         memset(&fl6, 0, sizeof(fl6));
2185         fl6.flowi6_oif = oif;
2186         fl6.daddr = *daddr;
2187         fl6.saddr = *saddr;
2188         fl6.flowlabel = ip6_make_flowinfo(RT_TOS(tos), label);
2189         fl6.flowi6_mark = skb->mark;
2190         fl6.flowi6_proto = IPPROTO_UDP;
2191         fl6.fl6_dport = dport;
2192         fl6.fl6_sport = sport;
2193
2194         err = ipv6_stub->ipv6_dst_lookup(vxlan->net,
2195                                          sock6->sock->sk,
2196                                          &ndst, &fl6);
2197         if (unlikely(err < 0)) {
2198                 netdev_dbg(dev, "no route to %pI6\n", daddr);
2199                 return ERR_PTR(-ENETUNREACH);
2200         }
2201
2202         if (unlikely(ndst->dev == dev)) {
2203                 netdev_dbg(dev, "circular route to %pI6\n", daddr);
2204                 dst_release(ndst);
2205                 return ERR_PTR(-ELOOP);
2206         }
2207
2208         *saddr = fl6.saddr;
2209         if (use_cache)
2210                 dst_cache_set_ip6(dst_cache, ndst, saddr);
2211         return ndst;
2212 }
2213 #endif
2214
2215 /* Bypass encapsulation if the destination is local */
2216 static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan,
2217                                struct vxlan_dev *dst_vxlan, __be32 vni)
2218 {
2219         struct pcpu_sw_netstats *tx_stats, *rx_stats;
2220         union vxlan_addr loopback;
2221         union vxlan_addr *remote_ip = &dst_vxlan->default_dst.remote_ip;
2222         struct net_device *dev = skb->dev;
2223         int len = skb->len;
2224
2225         tx_stats = this_cpu_ptr(src_vxlan->dev->tstats);
2226         rx_stats = this_cpu_ptr(dst_vxlan->dev->tstats);
2227         skb->pkt_type = PACKET_HOST;
2228         skb->encapsulation = 0;
2229         skb->dev = dst_vxlan->dev;
2230         __skb_pull(skb, skb_network_offset(skb));
2231
2232         if (remote_ip->sa.sa_family == AF_INET) {
2233                 loopback.sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
2234                 loopback.sa.sa_family =  AF_INET;
2235 #if IS_ENABLED(CONFIG_IPV6)
2236         } else {
2237                 loopback.sin6.sin6_addr = in6addr_loopback;
2238                 loopback.sa.sa_family =  AF_INET6;
2239 #endif
2240         }
2241
2242         if (dst_vxlan->cfg.flags & VXLAN_F_LEARN)
2243                 vxlan_snoop(skb->dev, &loopback, eth_hdr(skb)->h_source, 0,
2244                             vni);
2245
2246         u64_stats_update_begin(&tx_stats->syncp);
2247         tx_stats->tx_packets++;
2248         tx_stats->tx_bytes += len;
2249         u64_stats_update_end(&tx_stats->syncp);
2250
2251         if (netif_rx(skb) == NET_RX_SUCCESS) {
2252                 u64_stats_update_begin(&rx_stats->syncp);
2253                 rx_stats->rx_packets++;
2254                 rx_stats->rx_bytes += len;
2255                 u64_stats_update_end(&rx_stats->syncp);
2256         } else {
2257                 dev->stats.rx_dropped++;
2258         }
2259 }
2260
2261 static int encap_bypass_if_local(struct sk_buff *skb, struct net_device *dev,
2262                                  struct vxlan_dev *vxlan,
2263                                  union vxlan_addr *daddr,
2264                                  __be16 dst_port, int dst_ifindex, __be32 vni,
2265                                  struct dst_entry *dst,
2266                                  u32 rt_flags)
2267 {
2268 #if IS_ENABLED(CONFIG_IPV6)
2269         /* IPv6 rt-flags are checked against RTF_LOCAL, but the value of
2270          * RTF_LOCAL is equal to RTCF_LOCAL. So to keep code simple
2271          * we can use RTCF_LOCAL which works for ipv4 and ipv6 route entry.
2272          */
2273         BUILD_BUG_ON(RTCF_LOCAL != RTF_LOCAL);
2274 #endif
2275         /* Bypass encapsulation if the destination is local */
2276         if (rt_flags & RTCF_LOCAL &&
2277             !(rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) {
2278                 struct vxlan_dev *dst_vxlan;
2279
2280                 dst_release(dst);
2281                 dst_vxlan = vxlan_find_vni(vxlan->net, dst_ifindex, vni,
2282                                            daddr->sa.sa_family, dst_port,
2283                                            vxlan->cfg.flags);
2284                 if (!dst_vxlan) {
2285                         dev->stats.tx_errors++;
2286                         kfree_skb(skb);
2287
2288                         return -ENOENT;
2289                 }
2290                 vxlan_encap_bypass(skb, vxlan, dst_vxlan, vni);
2291                 return 1;
2292         }
2293
2294         return 0;
2295 }
2296
2297 static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
2298                            __be32 default_vni, struct vxlan_rdst *rdst,
2299                            bool did_rsc)
2300 {
2301         struct dst_cache *dst_cache;
2302         struct ip_tunnel_info *info;
2303         struct vxlan_dev *vxlan = netdev_priv(dev);
2304         const struct iphdr *old_iph = ip_hdr(skb);
2305         union vxlan_addr *dst;
2306         union vxlan_addr remote_ip, local_ip;
2307         struct vxlan_metadata _md;
2308         struct vxlan_metadata *md = &_md;
2309         __be16 src_port = 0, dst_port;
2310         struct dst_entry *ndst = NULL;
2311         __be32 vni, label;
2312         __u8 tos, ttl;
2313         int ifindex;
2314         int err;
2315         u32 flags = vxlan->cfg.flags;
2316         bool udp_sum = false;
2317         bool xnet = !net_eq(vxlan->net, dev_net(vxlan->dev));
2318
2319         info = skb_tunnel_info(skb);
2320
2321         if (rdst) {
2322                 dst = &rdst->remote_ip;
2323                 if (vxlan_addr_any(dst)) {
2324                         if (did_rsc) {
2325                                 /* short-circuited back to local bridge */
2326                                 vxlan_encap_bypass(skb, vxlan, vxlan, default_vni);
2327                                 return;
2328                         }
2329                         goto drop;
2330                 }
2331
2332                 dst_port = rdst->remote_port ? rdst->remote_port : vxlan->cfg.dst_port;
2333                 vni = (rdst->remote_vni) ? : default_vni;
2334                 ifindex = rdst->remote_ifindex;
2335                 local_ip = vxlan->cfg.saddr;
2336                 dst_cache = &rdst->dst_cache;
2337                 md->gbp = skb->mark;
2338                 if (flags & VXLAN_F_TTL_INHERIT) {
2339                         ttl = ip_tunnel_get_ttl(old_iph, skb);
2340                 } else {
2341                         ttl = vxlan->cfg.ttl;
2342                         if (!ttl && vxlan_addr_multicast(dst))
2343                                 ttl = 1;
2344                 }
2345
2346                 tos = vxlan->cfg.tos;
2347                 if (tos == 1)
2348                         tos = ip_tunnel_get_dsfield(old_iph, skb);
2349
2350                 if (dst->sa.sa_family == AF_INET)
2351                         udp_sum = !(flags & VXLAN_F_UDP_ZERO_CSUM_TX);
2352                 else
2353                         udp_sum = !(flags & VXLAN_F_UDP_ZERO_CSUM6_TX);
2354                 label = vxlan->cfg.label;
2355         } else {
2356                 if (!info) {
2357                         WARN_ONCE(1, "%s: Missing encapsulation instructions\n",
2358                                   dev->name);
2359                         goto drop;
2360                 }
2361                 remote_ip.sa.sa_family = ip_tunnel_info_af(info);
2362                 if (remote_ip.sa.sa_family == AF_INET) {
2363                         remote_ip.sin.sin_addr.s_addr = info->key.u.ipv4.dst;
2364                         local_ip.sin.sin_addr.s_addr = info->key.u.ipv4.src;
2365                 } else {
2366                         remote_ip.sin6.sin6_addr = info->key.u.ipv6.dst;
2367                         local_ip.sin6.sin6_addr = info->key.u.ipv6.src;
2368                 }
2369                 dst = &remote_ip;
2370                 dst_port = info->key.tp_dst ? : vxlan->cfg.dst_port;
2371                 vni = tunnel_id_to_key32(info->key.tun_id);
2372                 ifindex = 0;
2373                 dst_cache = &info->dst_cache;
2374                 if (info->options_len &&
2375                     info->key.tun_flags & TUNNEL_VXLAN_OPT)
2376                         md = ip_tunnel_info_opts(info);
2377                 ttl = info->key.ttl;
2378                 tos = info->key.tos;
2379                 label = info->key.label;
2380                 udp_sum = !!(info->key.tun_flags & TUNNEL_CSUM);
2381         }
2382         src_port = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min,
2383                                      vxlan->cfg.port_max, true);
2384
2385         rcu_read_lock();
2386         if (dst->sa.sa_family == AF_INET) {
2387                 struct vxlan_sock *sock4 = rcu_dereference(vxlan->vn4_sock);
2388                 struct rtable *rt;
2389                 __be16 df = 0;
2390
2391                 if (!ifindex)
2392                         ifindex = sock4->sock->sk->sk_bound_dev_if;
2393
2394                 rt = vxlan_get_route(vxlan, dev, sock4, skb, ifindex, tos,
2395                                      dst->sin.sin_addr.s_addr,
2396                                      &local_ip.sin.sin_addr.s_addr,
2397                                      dst_port, src_port,
2398                                      dst_cache, info);
2399                 if (IS_ERR(rt)) {
2400                         err = PTR_ERR(rt);
2401                         goto tx_error;
2402                 }
2403
2404                 if (!info) {
2405                         /* Bypass encapsulation if the destination is local */
2406                         err = encap_bypass_if_local(skb, dev, vxlan, dst,
2407                                                     dst_port, ifindex, vni,
2408                                                     &rt->dst, rt->rt_flags);
2409                         if (err)
2410                                 goto out_unlock;
2411
2412                         if (vxlan->cfg.df == VXLAN_DF_SET) {
2413                                 df = htons(IP_DF);
2414                         } else if (vxlan->cfg.df == VXLAN_DF_INHERIT) {
2415                                 struct ethhdr *eth = eth_hdr(skb);
2416
2417                                 if (ntohs(eth->h_proto) == ETH_P_IPV6 ||
2418                                     (ntohs(eth->h_proto) == ETH_P_IP &&
2419                                      old_iph->frag_off & htons(IP_DF)))
2420                                         df = htons(IP_DF);
2421                         }
2422                 } else if (info->key.tun_flags & TUNNEL_DONT_FRAGMENT) {
2423                         df = htons(IP_DF);
2424                 }
2425
2426                 ndst = &rt->dst;
2427                 skb_tunnel_check_pmtu(skb, ndst, VXLAN_HEADROOM);
2428
2429                 tos = ip_tunnel_ecn_encap(tos, old_iph, skb);
2430                 ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
2431                 err = vxlan_build_skb(skb, ndst, sizeof(struct iphdr),
2432                                       vni, md, flags, udp_sum);
2433                 if (err < 0)
2434                         goto tx_error;
2435
2436                 udp_tunnel_xmit_skb(rt, sock4->sock->sk, skb, local_ip.sin.sin_addr.s_addr,
2437                                     dst->sin.sin_addr.s_addr, tos, ttl, df,
2438                                     src_port, dst_port, xnet, !udp_sum);
2439 #if IS_ENABLED(CONFIG_IPV6)
2440         } else {
2441                 struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock);
2442
2443                 if (!ifindex)
2444                         ifindex = sock6->sock->sk->sk_bound_dev_if;
2445
2446                 ndst = vxlan6_get_route(vxlan, dev, sock6, skb, ifindex, tos,
2447                                         label, &dst->sin6.sin6_addr,
2448                                         &local_ip.sin6.sin6_addr,
2449                                         dst_port, src_port,
2450                                         dst_cache, info);
2451                 if (IS_ERR(ndst)) {
2452                         err = PTR_ERR(ndst);
2453                         ndst = NULL;
2454                         goto tx_error;
2455                 }
2456
2457                 if (!info) {
2458                         u32 rt6i_flags = ((struct rt6_info *)ndst)->rt6i_flags;
2459
2460                         err = encap_bypass_if_local(skb, dev, vxlan, dst,
2461                                                     dst_port, ifindex, vni,
2462                                                     ndst, rt6i_flags);
2463                         if (err)
2464                                 goto out_unlock;
2465                 }
2466
2467                 skb_tunnel_check_pmtu(skb, ndst, VXLAN6_HEADROOM);
2468
2469                 tos = ip_tunnel_ecn_encap(tos, old_iph, skb);
2470                 ttl = ttl ? : ip6_dst_hoplimit(ndst);
2471                 skb_scrub_packet(skb, xnet);
2472                 err = vxlan_build_skb(skb, ndst, sizeof(struct ipv6hdr),
2473                                       vni, md, flags, udp_sum);
2474                 if (err < 0)
2475                         goto tx_error;
2476
2477                 udp_tunnel6_xmit_skb(ndst, sock6->sock->sk, skb, dev,
2478                                      &local_ip.sin6.sin6_addr,
2479                                      &dst->sin6.sin6_addr, tos, ttl,
2480                                      label, src_port, dst_port, !udp_sum);
2481 #endif
2482         }
2483 out_unlock:
2484         rcu_read_unlock();
2485         return;
2486
2487 drop:
2488         dev->stats.tx_dropped++;
2489         dev_kfree_skb(skb);
2490         return;
2491
2492 tx_error:
2493         rcu_read_unlock();
2494         if (err == -ELOOP)
2495                 dev->stats.collisions++;
2496         else if (err == -ENETUNREACH)
2497                 dev->stats.tx_carrier_errors++;
2498         dst_release(ndst);
2499         dev->stats.tx_errors++;
2500         kfree_skb(skb);
2501 }
2502
2503 /* Transmit local packets over Vxlan
2504  *
2505  * Outer IP header inherits ECN and DF from inner header.
2506  * Outer UDP destination is the VXLAN assigned port.
2507  *           source port is based on hash of flow
2508  */
2509 static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
2510 {
2511         struct vxlan_dev *vxlan = netdev_priv(dev);
2512         struct vxlan_rdst *rdst, *fdst = NULL;
2513         const struct ip_tunnel_info *info;
2514         bool did_rsc = false;
2515         struct vxlan_fdb *f;
2516         struct ethhdr *eth;
2517         __be32 vni = 0;
2518
2519         info = skb_tunnel_info(skb);
2520
2521         skb_reset_mac_header(skb);
2522
2523         if (vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) {
2524                 if (info && info->mode & IP_TUNNEL_INFO_BRIDGE &&
2525                     info->mode & IP_TUNNEL_INFO_TX) {
2526                         vni = tunnel_id_to_key32(info->key.tun_id);
2527                 } else {
2528                         if (info && info->mode & IP_TUNNEL_INFO_TX)
2529                                 vxlan_xmit_one(skb, dev, vni, NULL, false);
2530                         else
2531                                 kfree_skb(skb);
2532                         return NETDEV_TX_OK;
2533                 }
2534         }
2535
2536         if (vxlan->cfg.flags & VXLAN_F_PROXY) {
2537                 eth = eth_hdr(skb);
2538                 if (ntohs(eth->h_proto) == ETH_P_ARP)
2539                         return arp_reduce(dev, skb, vni);
2540 #if IS_ENABLED(CONFIG_IPV6)
2541                 else if (ntohs(eth->h_proto) == ETH_P_IPV6 &&
2542                          pskb_may_pull(skb, sizeof(struct ipv6hdr) +
2543                                             sizeof(struct nd_msg)) &&
2544                          ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) {
2545                         struct nd_msg *m = (struct nd_msg *)(ipv6_hdr(skb) + 1);
2546
2547                         if (m->icmph.icmp6_code == 0 &&
2548                             m->icmph.icmp6_type == NDISC_NEIGHBOUR_SOLICITATION)
2549                                 return neigh_reduce(dev, skb, vni);
2550                 }
2551 #endif
2552         }
2553
2554         eth = eth_hdr(skb);
2555         f = vxlan_find_mac(vxlan, eth->h_dest, vni);
2556         did_rsc = false;
2557
2558         if (f && (f->flags & NTF_ROUTER) && (vxlan->cfg.flags & VXLAN_F_RSC) &&
2559             (ntohs(eth->h_proto) == ETH_P_IP ||
2560              ntohs(eth->h_proto) == ETH_P_IPV6)) {
2561                 did_rsc = route_shortcircuit(dev, skb);
2562                 if (did_rsc)
2563                         f = vxlan_find_mac(vxlan, eth->h_dest, vni);
2564         }
2565
2566         if (f == NULL) {
2567                 f = vxlan_find_mac(vxlan, all_zeros_mac, vni);
2568                 if (f == NULL) {
2569                         if ((vxlan->cfg.flags & VXLAN_F_L2MISS) &&
2570                             !is_multicast_ether_addr(eth->h_dest))
2571                                 vxlan_fdb_miss(vxlan, eth->h_dest);
2572
2573                         dev->stats.tx_dropped++;
2574                         kfree_skb(skb);
2575                         return NETDEV_TX_OK;
2576                 }
2577         }
2578
2579         list_for_each_entry_rcu(rdst, &f->remotes, list) {
2580                 struct sk_buff *skb1;
2581
2582                 if (!fdst) {
2583                         fdst = rdst;
2584                         continue;
2585                 }
2586                 skb1 = skb_clone(skb, GFP_ATOMIC);
2587                 if (skb1)
2588                         vxlan_xmit_one(skb1, dev, vni, rdst, did_rsc);
2589         }
2590
2591         if (fdst)
2592                 vxlan_xmit_one(skb, dev, vni, fdst, did_rsc);
2593         else
2594                 kfree_skb(skb);
2595         return NETDEV_TX_OK;
2596 }
2597
2598 /* Walk the forwarding table and purge stale entries */
2599 static void vxlan_cleanup(struct timer_list *t)
2600 {
2601         struct vxlan_dev *vxlan = from_timer(vxlan, t, age_timer);
2602         unsigned long next_timer = jiffies + FDB_AGE_INTERVAL;
2603         unsigned int h;
2604
2605         if (!netif_running(vxlan->dev))
2606                 return;
2607
2608         for (h = 0; h < FDB_HASH_SIZE; ++h) {
2609                 struct hlist_node *p, *n;
2610
2611                 spin_lock_bh(&vxlan->hash_lock);
2612                 hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) {
2613                         struct vxlan_fdb *f
2614                                 = container_of(p, struct vxlan_fdb, hlist);
2615                         unsigned long timeout;
2616
2617                         if (f->state & (NUD_PERMANENT | NUD_NOARP))
2618                                 continue;
2619
2620                         if (f->flags & NTF_EXT_LEARNED)
2621                                 continue;
2622
2623                         timeout = f->used + vxlan->cfg.age_interval * HZ;
2624                         if (time_before_eq(timeout, jiffies)) {
2625                                 netdev_dbg(vxlan->dev,
2626                                            "garbage collect %pM\n",
2627                                            f->eth_addr);
2628                                 f->state = NUD_STALE;
2629                                 vxlan_fdb_destroy(vxlan, f, true, true);
2630                         } else if (time_before(timeout, next_timer))
2631                                 next_timer = timeout;
2632                 }
2633                 spin_unlock_bh(&vxlan->hash_lock);
2634         }
2635
2636         mod_timer(&vxlan->age_timer, next_timer);
2637 }
2638
2639 static void vxlan_vs_del_dev(struct vxlan_dev *vxlan)
2640 {
2641         struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
2642
2643         spin_lock(&vn->sock_lock);
2644         hlist_del_init_rcu(&vxlan->hlist4.hlist);
2645 #if IS_ENABLED(CONFIG_IPV6)
2646         hlist_del_init_rcu(&vxlan->hlist6.hlist);
2647 #endif
2648         spin_unlock(&vn->sock_lock);
2649 }
2650
2651 static void vxlan_vs_add_dev(struct vxlan_sock *vs, struct vxlan_dev *vxlan,
2652                              struct vxlan_dev_node *node)
2653 {
2654         struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
2655         __be32 vni = vxlan->default_dst.remote_vni;
2656
2657         node->vxlan = vxlan;
2658         spin_lock(&vn->sock_lock);
2659         hlist_add_head_rcu(&node->hlist, vni_head(vs, vni));
2660         spin_unlock(&vn->sock_lock);
2661 }
2662
2663 /* Setup stats when device is created */
2664 static int vxlan_init(struct net_device *dev)
2665 {
2666         dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
2667         if (!dev->tstats)
2668                 return -ENOMEM;
2669
2670         return 0;
2671 }
2672
2673 static void vxlan_fdb_delete_default(struct vxlan_dev *vxlan, __be32 vni)
2674 {
2675         struct vxlan_fdb *f;
2676
2677         spin_lock_bh(&vxlan->hash_lock);
2678         f = __vxlan_find_mac(vxlan, all_zeros_mac, vni);
2679         if (f)
2680                 vxlan_fdb_destroy(vxlan, f, true, true);
2681         spin_unlock_bh(&vxlan->hash_lock);
2682 }
2683
2684 static void vxlan_uninit(struct net_device *dev)
2685 {
2686         struct vxlan_dev *vxlan = netdev_priv(dev);
2687
2688         vxlan_fdb_delete_default(vxlan, vxlan->cfg.vni);
2689
2690         free_percpu(dev->tstats);
2691 }
2692
2693 /* Start ageing timer and join group when device is brought up */
2694 static int vxlan_open(struct net_device *dev)
2695 {
2696         struct vxlan_dev *vxlan = netdev_priv(dev);
2697         int ret;
2698
2699         ret = vxlan_sock_add(vxlan);
2700         if (ret < 0)
2701                 return ret;
2702
2703         if (vxlan_addr_multicast(&vxlan->default_dst.remote_ip)) {
2704                 ret = vxlan_igmp_join(vxlan);
2705                 if (ret == -EADDRINUSE)
2706                         ret = 0;
2707                 if (ret) {
2708                         vxlan_sock_release(vxlan);
2709                         return ret;
2710                 }
2711         }
2712
2713         if (vxlan->cfg.age_interval)
2714                 mod_timer(&vxlan->age_timer, jiffies + FDB_AGE_INTERVAL);
2715
2716         return ret;
2717 }
2718
2719 /* Purge the forwarding table */
2720 static void vxlan_flush(struct vxlan_dev *vxlan, bool do_all)
2721 {
2722         unsigned int h;
2723
2724         spin_lock_bh(&vxlan->hash_lock);
2725         for (h = 0; h < FDB_HASH_SIZE; ++h) {
2726                 struct hlist_node *p, *n;
2727                 hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) {
2728                         struct vxlan_fdb *f
2729                                 = container_of(p, struct vxlan_fdb, hlist);
2730                         if (!do_all && (f->state & (NUD_PERMANENT | NUD_NOARP)))
2731                                 continue;
2732                         /* the all_zeros_mac entry is deleted at vxlan_uninit */
2733                         if (!is_zero_ether_addr(f->eth_addr))
2734                                 vxlan_fdb_destroy(vxlan, f, true, true);
2735                 }
2736         }
2737         spin_unlock_bh(&vxlan->hash_lock);
2738 }
2739
2740 /* Cleanup timer and forwarding table on shutdown */
2741 static int vxlan_stop(struct net_device *dev)
2742 {
2743         struct vxlan_dev *vxlan = netdev_priv(dev);
2744         struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
2745         int ret = 0;
2746
2747         if (vxlan_addr_multicast(&vxlan->default_dst.remote_ip) &&
2748             !vxlan_group_used(vn, vxlan))
2749                 ret = vxlan_igmp_leave(vxlan);
2750
2751         del_timer_sync(&vxlan->age_timer);
2752
2753         vxlan_flush(vxlan, false);
2754         vxlan_sock_release(vxlan);
2755
2756         return ret;
2757 }
2758
/* ndo_set_rx_mode stub for the Ethernet-mode ops table. */
static void vxlan_set_multicast_list(struct net_device *dev)
{
	/* Intentionally empty: nothing needs to be done. */
}
2763
2764 static int vxlan_change_mtu(struct net_device *dev, int new_mtu)
2765 {
2766         struct vxlan_dev *vxlan = netdev_priv(dev);
2767         struct vxlan_rdst *dst = &vxlan->default_dst;
2768         struct net_device *lowerdev = __dev_get_by_index(vxlan->net,
2769                                                          dst->remote_ifindex);
2770         bool use_ipv6 = !!(vxlan->cfg.flags & VXLAN_F_IPV6);
2771
2772         /* This check is different than dev->max_mtu, because it looks at
2773          * the lowerdev->mtu, rather than the static dev->max_mtu
2774          */
2775         if (lowerdev) {
2776                 int max_mtu = lowerdev->mtu -
2777                               (use_ipv6 ? VXLAN6_HEADROOM : VXLAN_HEADROOM);
2778                 if (new_mtu > max_mtu)
2779                         return -EINVAL;
2780         }
2781
2782         dev->mtu = new_mtu;
2783         return 0;
2784 }
2785
2786 static int vxlan_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
2787 {
2788         struct vxlan_dev *vxlan = netdev_priv(dev);
2789         struct ip_tunnel_info *info = skb_tunnel_info(skb);
2790         __be16 sport, dport;
2791
2792         sport = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min,
2793                                   vxlan->cfg.port_max, true);
2794         dport = info->key.tp_dst ? : vxlan->cfg.dst_port;
2795
2796         if (ip_tunnel_info_af(info) == AF_INET) {
2797                 struct vxlan_sock *sock4 = rcu_dereference(vxlan->vn4_sock);
2798                 struct rtable *rt;
2799
2800                 rt = vxlan_get_route(vxlan, dev, sock4, skb, 0, info->key.tos,
2801                                      info->key.u.ipv4.dst,
2802                                      &info->key.u.ipv4.src, dport, sport,
2803                                      &info->dst_cache, info);
2804                 if (IS_ERR(rt))
2805                         return PTR_ERR(rt);
2806                 ip_rt_put(rt);
2807         } else {
2808 #if IS_ENABLED(CONFIG_IPV6)
2809                 struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock);
2810                 struct dst_entry *ndst;
2811
2812                 ndst = vxlan6_get_route(vxlan, dev, sock6, skb, 0, info->key.tos,
2813                                         info->key.label, &info->key.u.ipv6.dst,
2814                                         &info->key.u.ipv6.src, dport, sport,
2815                                         &info->dst_cache, info);
2816                 if (IS_ERR(ndst))
2817                         return PTR_ERR(ndst);
2818                 dst_release(ndst);
2819 #else /* !CONFIG_IPV6 */
2820                 return -EPFNOSUPPORT;
2821 #endif
2822         }
2823         info->key.tp_src = sport;
2824         info->key.tp_dst = dport;
2825         return 0;
2826 }
2827
2828 static const struct net_device_ops vxlan_netdev_ether_ops = {
2829         .ndo_init               = vxlan_init,
2830         .ndo_uninit             = vxlan_uninit,
2831         .ndo_open               = vxlan_open,
2832         .ndo_stop               = vxlan_stop,
2833         .ndo_start_xmit         = vxlan_xmit,
2834         .ndo_get_stats64        = ip_tunnel_get_stats64,
2835         .ndo_set_rx_mode        = vxlan_set_multicast_list,
2836         .ndo_change_mtu         = vxlan_change_mtu,
2837         .ndo_validate_addr      = eth_validate_addr,
2838         .ndo_set_mac_address    = eth_mac_addr,
2839         .ndo_fdb_add            = vxlan_fdb_add,
2840         .ndo_fdb_del            = vxlan_fdb_delete,
2841         .ndo_fdb_dump           = vxlan_fdb_dump,
2842         .ndo_fdb_get            = vxlan_fdb_get,
2843         .ndo_fill_metadata_dst  = vxlan_fill_metadata_dst,
2844 };
2845
2846 static const struct net_device_ops vxlan_netdev_raw_ops = {
2847         .ndo_init               = vxlan_init,
2848         .ndo_uninit             = vxlan_uninit,
2849         .ndo_open               = vxlan_open,
2850         .ndo_stop               = vxlan_stop,
2851         .ndo_start_xmit         = vxlan_xmit,
2852         .ndo_get_stats64        = ip_tunnel_get_stats64,
2853         .ndo_change_mtu         = vxlan_change_mtu,
2854         .ndo_fill_metadata_dst  = vxlan_fill_metadata_dst,
2855 };
2856
2857 /* Info for udev, that this is a virtual tunnel endpoint */
2858 static struct device_type vxlan_type = {
2859         .name = "vxlan",
2860 };
2861
2862 /* Calls the ndo_udp_tunnel_add of the caller in order to
2863  * supply the listening VXLAN udp ports. Callers are expected
2864  * to implement the ndo_udp_tunnel_add.
2865  */
2866 static void vxlan_offload_rx_ports(struct net_device *dev, bool push)
2867 {
2868         struct vxlan_sock *vs;
2869         struct net *net = dev_net(dev);
2870         struct vxlan_net *vn = net_generic(net, vxlan_net_id);
2871         unsigned int i;
2872
2873         spin_lock(&vn->sock_lock);
2874         for (i = 0; i < PORT_HASH_SIZE; ++i) {
2875                 hlist_for_each_entry_rcu(vs, &vn->sock_list[i], hlist) {
2876                         unsigned short type;
2877
2878                         if (vs->flags & VXLAN_F_GPE)
2879                                 type = UDP_TUNNEL_TYPE_VXLAN_GPE;
2880                         else
2881                                 type = UDP_TUNNEL_TYPE_VXLAN;
2882
2883                         if (push)
2884                                 udp_tunnel_push_rx_port(dev, vs->sock, type);
2885                         else
2886                                 udp_tunnel_drop_rx_port(dev, vs->sock, type);
2887                 }
2888         }
2889         spin_unlock(&vn->sock_lock);
2890 }
2891
2892 /* Initialize the device structure. */
2893 static void vxlan_setup(struct net_device *dev)
2894 {
2895         struct vxlan_dev *vxlan = netdev_priv(dev);
2896         unsigned int h;
2897
2898         eth_hw_addr_random(dev);
2899         ether_setup(dev);
2900
2901         dev->needs_free_netdev = true;
2902         SET_NETDEV_DEVTYPE(dev, &vxlan_type);
2903
2904         dev->features   |= NETIF_F_LLTX;
2905         dev->features   |= NETIF_F_SG | NETIF_F_HW_CSUM;
2906         dev->features   |= NETIF_F_RXCSUM;
2907         dev->features   |= NETIF_F_GSO_SOFTWARE;
2908
2909         dev->vlan_features = dev->features;
2910         dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM;
2911         dev->hw_features |= NETIF_F_GSO_SOFTWARE;
2912         netif_keep_dst(dev);
2913         dev->priv_flags |= IFF_NO_QUEUE;
2914
2915         /* MTU range: 68 - 65535 */
2916         dev->min_mtu = ETH_MIN_MTU;
2917         dev->max_mtu = ETH_MAX_MTU;
2918
2919         INIT_LIST_HEAD(&vxlan->next);
2920         spin_lock_init(&vxlan->hash_lock);
2921
2922         timer_setup(&vxlan->age_timer, vxlan_cleanup, TIMER_DEFERRABLE);
2923
2924         vxlan->dev = dev;
2925
2926         gro_cells_init(&vxlan->gro_cells, dev);
2927
2928         for (h = 0; h < FDB_HASH_SIZE; ++h)
2929                 INIT_HLIST_HEAD(&vxlan->fdb_head[h]);
2930 }
2931
2932 static void vxlan_ether_setup(struct net_device *dev)
2933 {
2934         dev->priv_flags &= ~IFF_TX_SKB_SHARING;
2935         dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
2936         dev->netdev_ops = &vxlan_netdev_ether_ops;
2937 }
2938
2939 static void vxlan_raw_setup(struct net_device *dev)
2940 {
2941         dev->header_ops = NULL;
2942         dev->type = ARPHRD_NONE;
2943         dev->hard_header_len = 0;
2944         dev->addr_len = 0;
2945         dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST;
2946         dev->netdev_ops = &vxlan_netdev_raw_ops;
2947 }
2948
2949 static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
2950         [IFLA_VXLAN_ID]         = { .type = NLA_U32 },
2951         [IFLA_VXLAN_GROUP]      = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
2952         [IFLA_VXLAN_GROUP6]     = { .len = sizeof(struct in6_addr) },
2953         [IFLA_VXLAN_LINK]       = { .type = NLA_U32 },
2954         [IFLA_VXLAN_LOCAL]      = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
2955         [IFLA_VXLAN_LOCAL6]     = { .len = sizeof(struct in6_addr) },
2956         [IFLA_VXLAN_TOS]        = { .type = NLA_U8 },
2957         [IFLA_VXLAN_TTL]        = { .type = NLA_U8 },
2958         [IFLA_VXLAN_LABEL]      = { .type = NLA_U32 },
2959         [IFLA_VXLAN_LEARNING]   = { .type = NLA_U8 },
2960         [IFLA_VXLAN_AGEING]     = { .type = NLA_U32 },
2961         [IFLA_VXLAN_LIMIT]      = { .type = NLA_U32 },
2962         [IFLA_VXLAN_PORT_RANGE] = { .len  = sizeof(struct ifla_vxlan_port_range) },
2963         [IFLA_VXLAN_PROXY]      = { .type = NLA_U8 },
2964         [IFLA_VXLAN_RSC]        = { .type = NLA_U8 },
2965         [IFLA_VXLAN_L2MISS]     = { .type = NLA_U8 },
2966         [IFLA_VXLAN_L3MISS]     = { .type = NLA_U8 },
2967         [IFLA_VXLAN_COLLECT_METADATA]   = { .type = NLA_U8 },
2968         [IFLA_VXLAN_PORT]       = { .type = NLA_U16 },
2969         [IFLA_VXLAN_UDP_CSUM]   = { .type = NLA_U8 },
2970         [IFLA_VXLAN_UDP_ZERO_CSUM6_TX]  = { .type = NLA_U8 },
2971         [IFLA_VXLAN_UDP_ZERO_CSUM6_RX]  = { .type = NLA_U8 },
2972         [IFLA_VXLAN_REMCSUM_TX] = { .type = NLA_U8 },
2973         [IFLA_VXLAN_REMCSUM_RX] = { .type = NLA_U8 },
2974         [IFLA_VXLAN_GBP]        = { .type = NLA_FLAG, },
2975         [IFLA_VXLAN_GPE]        = { .type = NLA_FLAG, },
2976         [IFLA_VXLAN_REMCSUM_NOPARTIAL]  = { .type = NLA_FLAG },
2977         [IFLA_VXLAN_TTL_INHERIT]        = { .type = NLA_FLAG },
2978         [IFLA_VXLAN_DF]         = { .type = NLA_U8 },
2979 };
2980
2981 static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[],
2982                           struct netlink_ext_ack *extack)
2983 {
2984         if (tb[IFLA_ADDRESS]) {
2985                 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) {
2986                         NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_ADDRESS],
2987                                             "Provided link layer address is not Ethernet");
2988                         return -EINVAL;
2989                 }
2990
2991                 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) {
2992                         NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_ADDRESS],
2993                                             "Provided Ethernet address is not unicast");
2994                         return -EADDRNOTAVAIL;
2995                 }
2996         }
2997
2998         if (tb[IFLA_MTU]) {
2999                 u32 mtu = nla_get_u32(tb[IFLA_MTU]);
3000
3001                 if (mtu < ETH_MIN_MTU || mtu > ETH_MAX_MTU) {
3002                         NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_MTU],
3003                                             "MTU must be between 68 and 65535");
3004                         return -EINVAL;
3005                 }
3006         }
3007
3008         if (!data) {
3009                 NL_SET_ERR_MSG(extack,
3010                                "Required attributes not provided to perform the operation");
3011                 return -EINVAL;
3012         }
3013
3014         if (data[IFLA_VXLAN_ID]) {
3015                 u32 id = nla_get_u32(data[IFLA_VXLAN_ID]);
3016
3017                 if (id >= VXLAN_N_VID) {
3018                         NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_ID],
3019                                             "VXLAN ID must be lower than 16777216");
3020                         return -ERANGE;
3021                 }
3022         }
3023
3024         if (data[IFLA_VXLAN_PORT_RANGE]) {
3025                 const struct ifla_vxlan_port_range *p
3026                         = nla_data(data[IFLA_VXLAN_PORT_RANGE]);
3027
3028                 if (ntohs(p->high) < ntohs(p->low)) {
3029                         NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_PORT_RANGE],
3030                                             "Invalid source port range");
3031                         return -EINVAL;
3032                 }
3033         }
3034
3035         if (data[IFLA_VXLAN_DF]) {
3036                 enum ifla_vxlan_df df = nla_get_u8(data[IFLA_VXLAN_DF]);
3037
3038                 if (df < 0 || df > VXLAN_DF_MAX) {
3039                         NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_DF],
3040                                             "Invalid DF attribute");
3041                         return -EINVAL;
3042                 }
3043         }
3044
3045         return 0;
3046 }
3047
3048 static void vxlan_get_drvinfo(struct net_device *netdev,
3049                               struct ethtool_drvinfo *drvinfo)
3050 {
3051         strlcpy(drvinfo->version, VXLAN_VERSION, sizeof(drvinfo->version));
3052         strlcpy(drvinfo->driver, "vxlan", sizeof(drvinfo->driver));
3053 }
3054
3055 static const struct ethtool_ops vxlan_ethtool_ops = {
3056         .get_drvinfo    = vxlan_get_drvinfo,
3057         .get_link       = ethtool_op_get_link,
3058 };
3059
3060 static struct socket *vxlan_create_sock(struct net *net, bool ipv6,
3061                                         __be16 port, u32 flags, int ifindex)
3062 {
3063         struct socket *sock;
3064         struct udp_port_cfg udp_conf;
3065         int err;
3066
3067         memset(&udp_conf, 0, sizeof(udp_conf));
3068
3069         if (ipv6) {
3070                 udp_conf.family = AF_INET6;
3071                 udp_conf.use_udp6_rx_checksums =
3072                     !(flags & VXLAN_F_UDP_ZERO_CSUM6_RX);
3073                 udp_conf.ipv6_v6only = 1;
3074         } else {
3075                 udp_conf.family = AF_INET;
3076         }
3077
3078         udp_conf.local_udp_port = port;
3079         udp_conf.bind_ifindex = ifindex;
3080
3081         /* Open UDP socket */
3082         err = udp_sock_create(net, &udp_conf, &sock);
3083         if (err < 0)
3084                 return ERR_PTR(err);
3085
3086         return sock;
3087 }
3088
3089 /* Create new listen socket if needed */
3090 static struct vxlan_sock *vxlan_socket_create(struct net *net, bool ipv6,
3091                                               __be16 port, u32 flags,
3092                                               int ifindex)
3093 {
3094         struct vxlan_net *vn = net_generic(net, vxlan_net_id);
3095         struct vxlan_sock *vs;
3096         struct socket *sock;
3097         unsigned int h;
3098         struct udp_tunnel_sock_cfg tunnel_cfg;
3099
3100         vs = kzalloc(sizeof(*vs), GFP_KERNEL);
3101         if (!vs)
3102                 return ERR_PTR(-ENOMEM);
3103
3104         for (h = 0; h < VNI_HASH_SIZE; ++h)
3105                 INIT_HLIST_HEAD(&vs->vni_list[h]);
3106
3107         sock = vxlan_create_sock(net, ipv6, port, flags, ifindex);
3108         if (IS_ERR(sock)) {
3109                 kfree(vs);
3110                 return ERR_CAST(sock);
3111         }
3112
3113         vs->sock = sock;
3114         refcount_set(&vs->refcnt, 1);
3115         vs->flags = (flags & VXLAN_F_RCV_FLAGS);
3116
3117         spin_lock(&vn->sock_lock);
3118         hlist_add_head_rcu(&vs->hlist, vs_head(net, port));
3119         udp_tunnel_notify_add_rx_port(sock,
3120                                       (vs->flags & VXLAN_F_GPE) ?
3121                                       UDP_TUNNEL_TYPE_VXLAN_GPE :
3122                                       UDP_TUNNEL_TYPE_VXLAN);
3123         spin_unlock(&vn->sock_lock);
3124
3125         /* Mark socket as an encapsulation socket. */
3126         memset(&tunnel_cfg, 0, sizeof(tunnel_cfg));
3127         tunnel_cfg.sk_user_data = vs;
3128         tunnel_cfg.encap_type = 1;
3129         tunnel_cfg.encap_rcv = vxlan_rcv;
3130         tunnel_cfg.encap_err_lookup = vxlan_err_lookup;
3131         tunnel_cfg.encap_destroy = NULL;
3132         tunnel_cfg.gro_receive = vxlan_gro_receive;
3133         tunnel_cfg.gro_complete = vxlan_gro_complete;
3134
3135         setup_udp_tunnel_sock(net, sock, &tunnel_cfg);
3136
3137         return vs;
3138 }
3139
3140 static int __vxlan_sock_add(struct vxlan_dev *vxlan, bool ipv6)
3141 {
3142         struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
3143         struct vxlan_sock *vs = NULL;
3144         struct vxlan_dev_node *node;
3145         int l3mdev_index = 0;
3146
3147         if (vxlan->cfg.remote_ifindex)
3148                 l3mdev_index = l3mdev_master_upper_ifindex_by_index(
3149                         vxlan->net, vxlan->cfg.remote_ifindex);
3150
3151         if (!vxlan->cfg.no_share) {
3152                 spin_lock(&vn->sock_lock);
3153                 vs = vxlan_find_sock(vxlan->net, ipv6 ? AF_INET6 : AF_INET,
3154                                      vxlan->cfg.dst_port, vxlan->cfg.flags,
3155                                      l3mdev_index);
3156                 if (vs && !refcount_inc_not_zero(&vs->refcnt)) {
3157                         spin_unlock(&vn->sock_lock);
3158                         return -EBUSY;
3159                 }
3160                 spin_unlock(&vn->sock_lock);
3161         }
3162         if (!vs)
3163                 vs = vxlan_socket_create(vxlan->net, ipv6,
3164                                          vxlan->cfg.dst_port, vxlan->cfg.flags,
3165                                          l3mdev_index);
3166         if (IS_ERR(vs))
3167                 return PTR_ERR(vs);
3168 #if IS_ENABLED(CONFIG_IPV6)
3169         if (ipv6) {
3170                 rcu_assign_pointer(vxlan->vn6_sock, vs);
3171                 node = &vxlan->hlist6;
3172         } else
3173 #endif
3174         {
3175                 rcu_assign_pointer(vxlan->vn4_sock, vs);
3176                 node = &vxlan->hlist4;
3177         }
3178         vxlan_vs_add_dev(vs, vxlan, node);
3179         return 0;
3180 }
3181
3182 static int vxlan_sock_add(struct vxlan_dev *vxlan)
3183 {
3184         bool metadata = vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA;
3185         bool ipv6 = vxlan->cfg.flags & VXLAN_F_IPV6 || metadata;
3186         bool ipv4 = !ipv6 || metadata;
3187         int ret = 0;
3188
3189         RCU_INIT_POINTER(vxlan->vn4_sock, NULL);
3190 #if IS_ENABLED(CONFIG_IPV6)
3191         RCU_INIT_POINTER(vxlan->vn6_sock, NULL);
3192         if (ipv6) {
3193                 ret = __vxlan_sock_add(vxlan, true);
3194                 if (ret < 0 && ret != -EAFNOSUPPORT)
3195                         ipv4 = false;
3196         }
3197 #endif
3198         if (ipv4)
3199                 ret = __vxlan_sock_add(vxlan, false);
3200         if (ret < 0)
3201                 vxlan_sock_release(vxlan);
3202         return ret;
3203 }
3204
3205 static int vxlan_config_validate(struct net *src_net, struct vxlan_config *conf,
3206                                  struct net_device **lower,
3207                                  struct vxlan_dev *old,
3208                                  struct netlink_ext_ack *extack)
3209 {
3210         struct vxlan_net *vn = net_generic(src_net, vxlan_net_id);
3211         struct vxlan_dev *tmp;
3212         bool use_ipv6 = false;
3213
3214         if (conf->flags & VXLAN_F_GPE) {
3215                 /* For now, allow GPE only together with
3216                  * COLLECT_METADATA. This can be relaxed later; in such
3217                  * case, the other side of the PtP link will have to be
3218                  * provided.
3219                  */
3220                 if ((conf->flags & ~VXLAN_F_ALLOWED_GPE) ||
3221                     !(conf->flags & VXLAN_F_COLLECT_METADATA)) {
3222                         NL_SET_ERR_MSG(extack,
3223                                        "VXLAN GPE does not support this combination of attributes");
3224                         return -EINVAL;
3225                 }
3226         }
3227
3228         if (!conf->remote_ip.sa.sa_family && !conf->saddr.sa.sa_family) {
3229                 /* Unless IPv6 is explicitly requested, assume IPv4 */
3230                 conf->remote_ip.sa.sa_family = AF_INET;
3231                 conf->saddr.sa.sa_family = AF_INET;
3232         } else if (!conf->remote_ip.sa.sa_family) {
3233                 conf->remote_ip.sa.sa_family = conf->saddr.sa.sa_family;
3234         } else if (!conf->saddr.sa.sa_family) {
3235                 conf->saddr.sa.sa_family = conf->remote_ip.sa.sa_family;
3236         }
3237
3238         if (conf->saddr.sa.sa_family != conf->remote_ip.sa.sa_family) {
3239                 NL_SET_ERR_MSG(extack,
3240                                "Local and remote address must be from the same family");
3241                 return -EINVAL;
3242         }
3243
3244         if (vxlan_addr_multicast(&conf->saddr)) {
3245                 NL_SET_ERR_MSG(extack, "Local address cannot be multicast");
3246                 return -EINVAL;
3247         }
3248
3249         if (conf->saddr.sa.sa_family == AF_INET6) {
3250                 if (!IS_ENABLED(CONFIG_IPV6)) {
3251                         NL_SET_ERR_MSG(extack,
3252                                        "IPv6 support not enabled in the kernel");
3253                         return -EPFNOSUPPORT;
3254                 }
3255                 use_ipv6 = true;
3256                 conf->flags |= VXLAN_F_IPV6;
3257
3258                 if (!(conf->flags & VXLAN_F_COLLECT_METADATA)) {
3259                         int local_type =
3260                                 ipv6_addr_type(&conf->saddr.sin6.sin6_addr);
3261                         int remote_type =
3262                                 ipv6_addr_type(&conf->remote_ip.sin6.sin6_addr);
3263
3264                         if (local_type & IPV6_ADDR_LINKLOCAL) {
3265                                 if (!(remote_type & IPV6_ADDR_LINKLOCAL) &&
3266                                     (remote_type != IPV6_ADDR_ANY)) {
3267                                         NL_SET_ERR_MSG(extack,
3268                                                        "Invalid combination of local and remote address scopes");
3269                                         return -EINVAL;
3270                                 }
3271
3272                                 conf->flags |= VXLAN_F_IPV6_LINKLOCAL;
3273                         } else {
3274                                 if (remote_type ==
3275                                     (IPV6_ADDR_UNICAST | IPV6_ADDR_LINKLOCAL)) {
3276                                         NL_SET_ERR_MSG(extack,
3277                                                        "Invalid combination of local and remote address scopes");
3278                                         return -EINVAL;
3279                                 }
3280
3281                                 conf->flags &= ~VXLAN_F_IPV6_LINKLOCAL;
3282                         }
3283                 }
3284         }
3285
3286         if (conf->label && !use_ipv6) {
3287                 NL_SET_ERR_MSG(extack,
3288                                "Label attribute only applies to IPv6 VXLAN devices");
3289                 return -EINVAL;
3290         }
3291
3292         if (conf->remote_ifindex) {
3293                 struct net_device *lowerdev;
3294
3295                 lowerdev = __dev_get_by_index(src_net, conf->remote_ifindex);
3296                 if (!lowerdev) {
3297                         NL_SET_ERR_MSG(extack,
3298                                        "Invalid local interface, device not found");
3299                         return -ENODEV;
3300                 }
3301
3302 #if IS_ENABLED(CONFIG_IPV6)
3303                 if (use_ipv6) {
3304                         struct inet6_dev *idev = __in6_dev_get(lowerdev);
3305                         if (idev && idev->cnf.disable_ipv6) {
3306                                 NL_SET_ERR_MSG(extack,
3307                                                "IPv6 support disabled by administrator");
3308                                 return -EPERM;
3309                         }
3310                 }
3311 #endif
3312
3313                 *lower = lowerdev;
3314         } else {
3315                 if (vxlan_addr_multicast(&conf->remote_ip)) {
3316                         NL_SET_ERR_MSG(extack,
3317                                        "Local interface required for multicast remote destination");
3318
3319                         return -EINVAL;
3320                 }
3321
3322 #if IS_ENABLED(CONFIG_IPV6)
3323                 if (conf->flags & VXLAN_F_IPV6_LINKLOCAL) {
3324                         NL_SET_ERR_MSG(extack,
3325                                        "Local interface required for link-local local/remote addresses");
3326                         return -EINVAL;
3327                 }
3328 #endif
3329
3330                 *lower = NULL;
3331         }
3332
3333         if (!conf->dst_port) {
3334                 if (conf->flags & VXLAN_F_GPE)
3335                         conf->dst_port = htons(4790); /* IANA VXLAN-GPE port */
3336                 else
3337                         conf->dst_port = htons(vxlan_port);
3338         }
3339
3340         if (!conf->age_interval)
3341                 conf->age_interval = FDB_AGE_DEFAULT;
3342
3343         list_for_each_entry(tmp, &vn->vxlan_list, next) {
3344                 if (tmp == old)
3345                         continue;
3346
3347                 if (tmp->cfg.vni != conf->vni)
3348                         continue;
3349                 if (tmp->cfg.dst_port != conf->dst_port)
3350                         continue;
3351                 if ((tmp->cfg.flags & (VXLAN_F_RCV_FLAGS | VXLAN_F_IPV6)) !=
3352                     (conf->flags & (VXLAN_F_RCV_FLAGS | VXLAN_F_IPV6)))
3353                         continue;
3354
3355                 if ((conf->flags & VXLAN_F_IPV6_LINKLOCAL) &&
3356                     tmp->cfg.remote_ifindex != conf->remote_ifindex)
3357                         continue;
3358
3359                 NL_SET_ERR_MSG(extack,
3360                                "A VXLAN device with the specified VNI already exists");
3361                 return -EEXIST;
3362         }
3363
3364         return 0;
3365 }
3366
3367 static void vxlan_config_apply(struct net_device *dev,
3368                                struct vxlan_config *conf,
3369                                struct net_device *lowerdev,
3370                                struct net *src_net,
3371                                bool changelink)
3372 {
3373         struct vxlan_dev *vxlan = netdev_priv(dev);
3374         struct vxlan_rdst *dst = &vxlan->default_dst;
3375         unsigned short needed_headroom = ETH_HLEN;
3376         bool use_ipv6 = !!(conf->flags & VXLAN_F_IPV6);
3377         int max_mtu = ETH_MAX_MTU;
3378
3379         if (!changelink) {
3380                 if (conf->flags & VXLAN_F_GPE)
3381                         vxlan_raw_setup(dev);
3382                 else
3383                         vxlan_ether_setup(dev);
3384
3385                 if (conf->mtu)
3386                         dev->mtu = conf->mtu;
3387
3388                 vxlan->net = src_net;
3389         }
3390
3391         dst->remote_vni = conf->vni;
3392
3393         memcpy(&dst->remote_ip, &conf->remote_ip, sizeof(conf->remote_ip));
3394
3395         if (lowerdev) {
3396                 dst->remote_ifindex = conf->remote_ifindex;
3397
3398                 dev->gso_max_size = lowerdev->gso_max_size;
3399                 dev->gso_max_segs = lowerdev->gso_max_segs;
3400
3401                 needed_headroom = lowerdev->hard_header_len;
3402
3403                 max_mtu = lowerdev->mtu - (use_ipv6 ? VXLAN6_HEADROOM :
3404                                            VXLAN_HEADROOM);
3405                 if (max_mtu < ETH_MIN_MTU)
3406                         max_mtu = ETH_MIN_MTU;
3407
3408                 if (!changelink && !conf->mtu)
3409                         dev->mtu = max_mtu;
3410         }
3411
3412         if (dev->mtu > max_mtu)
3413                 dev->mtu = max_mtu;
3414
3415         if (use_ipv6 || conf->flags & VXLAN_F_COLLECT_METADATA)
3416                 needed_headroom += VXLAN6_HEADROOM;
3417         else
3418                 needed_headroom += VXLAN_HEADROOM;
3419         dev->needed_headroom = needed_headroom;
3420
3421         memcpy(&vxlan->cfg, conf, sizeof(*conf));
3422 }
3423
3424 static int vxlan_dev_configure(struct net *src_net, struct net_device *dev,
3425                                struct vxlan_config *conf, bool changelink,
3426                                struct netlink_ext_ack *extack)
3427 {
3428         struct vxlan_dev *vxlan = netdev_priv(dev);
3429         struct net_device *lowerdev;
3430         int ret;
3431
3432         ret = vxlan_config_validate(src_net, conf, &lowerdev, vxlan, extack);
3433         if (ret)
3434                 return ret;
3435
3436         vxlan_config_apply(dev, conf, lowerdev, src_net, changelink);
3437
3438         return 0;
3439 }
3440
3441 static int __vxlan_dev_create(struct net *net, struct net_device *dev,
3442                               struct vxlan_config *conf,
3443                               struct netlink_ext_ack *extack)
3444 {
3445         struct vxlan_net *vn = net_generic(net, vxlan_net_id);
3446         struct vxlan_dev *vxlan = netdev_priv(dev);
3447         struct vxlan_fdb *f = NULL;
3448         bool unregister = false;
3449         int err;
3450
3451         err = vxlan_dev_configure(net, dev, conf, false, extack);
3452         if (err)
3453                 return err;
3454
3455         dev->ethtool_ops = &vxlan_ethtool_ops;
3456
3457         /* create an fdb entry for a valid default destination */
3458         if (!vxlan_addr_any(&vxlan->default_dst.remote_ip)) {
3459                 err = vxlan_fdb_create(vxlan, all_zeros_mac,
3460                                        &vxlan->default_dst.remote_ip,
3461                                        NUD_REACHABLE | NUD_PERMANENT,
3462                                        vxlan->cfg.dst_port,
3463                                        vxlan->default_dst.remote_vni,
3464                                        vxlan->default_dst.remote_vni,
3465                                        vxlan->default_dst.remote_ifindex,
3466                                        NTF_SELF, &f);
3467                 if (err)
3468                         return err;
3469         }
3470
3471         err = register_netdevice(dev);
3472         if (err)
3473                 goto errout;
3474         unregister = true;
3475
3476         err = rtnl_configure_link(dev, NULL);
3477         if (err)
3478                 goto errout;
3479
3480         /* notify default fdb entry */
3481         if (f)
3482                 vxlan_fdb_notify(vxlan, f, first_remote_rtnl(f), RTM_NEWNEIGH,
3483                                  true);
3484
3485         list_add(&vxlan->next, &vn->vxlan_list);
3486         return 0;
3487
3488 errout:
3489         /* unregister_netdevice() destroys the default FDB entry with deletion
3490          * notification. But the addition notification was not sent yet, so
3491          * destroy the entry by hand here.
3492          */
3493         if (f)
3494                 vxlan_fdb_destroy(vxlan, f, false, false);
3495         if (unregister)
3496                 unregister_netdevice(dev);
3497         return err;
3498 }
3499
3500 static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[],
3501                          struct net_device *dev, struct vxlan_config *conf,
3502                          bool changelink)
3503 {
3504         struct vxlan_dev *vxlan = netdev_priv(dev);
3505
3506         memset(conf, 0, sizeof(*conf));
3507
3508         /* if changelink operation, start with old existing cfg */
3509         if (changelink)
3510                 memcpy(conf, &vxlan->cfg, sizeof(*conf));
3511
3512         if (data[IFLA_VXLAN_ID]) {
3513                 __be32 vni = cpu_to_be32(nla_get_u32(data[IFLA_VXLAN_ID]));
3514
3515                 if (changelink && (vni != conf->vni))
3516                         return -EOPNOTSUPP;
3517                 conf->vni = cpu_to_be32(nla_get_u32(data[IFLA_VXLAN_ID]));
3518         }
3519
3520         if (data[IFLA_VXLAN_GROUP]) {
3521                 if (changelink && (conf->remote_ip.sa.sa_family != AF_INET))
3522                         return -EOPNOTSUPP;
3523
3524                 conf->remote_ip.sin.sin_addr.s_addr = nla_get_in_addr(data[IFLA_VXLAN_GROUP]);
3525                 conf->remote_ip.sa.sa_family = AF_INET;
3526         } else if (data[IFLA_VXLAN_GROUP6]) {
3527                 if (!IS_ENABLED(CONFIG_IPV6))
3528                         return -EPFNOSUPPORT;
3529
3530                 if (changelink && (conf->remote_ip.sa.sa_family != AF_INET6))
3531                         return -EOPNOTSUPP;
3532
3533                 conf->remote_ip.sin6.sin6_addr = nla_get_in6_addr(data[IFLA_VXLAN_GROUP6]);
3534                 conf->remote_ip.sa.sa_family = AF_INET6;
3535         }
3536
3537         if (data[IFLA_VXLAN_LOCAL]) {
3538                 if (changelink && (conf->saddr.sa.sa_family != AF_INET))
3539                         return -EOPNOTSUPP;
3540
3541                 conf->saddr.sin.sin_addr.s_addr = nla_get_in_addr(data[IFLA_VXLAN_LOCAL]);
3542                 conf->saddr.sa.sa_family = AF_INET;
3543         } else if (data[IFLA_VXLAN_LOCAL6]) {
3544                 if (!IS_ENABLED(CONFIG_IPV6))
3545                         return -EPFNOSUPPORT;
3546
3547                 if (changelink && (conf->saddr.sa.sa_family != AF_INET6))
3548                         return -EOPNOTSUPP;
3549
3550                 /* TODO: respect scope id */
3551                 conf->saddr.sin6.sin6_addr = nla_get_in6_addr(data[IFLA_VXLAN_LOCAL6]);
3552                 conf->saddr.sa.sa_family = AF_INET6;
3553         }
3554
3555         if (data[IFLA_VXLAN_LINK])
3556                 conf->remote_ifindex = nla_get_u32(data[IFLA_VXLAN_LINK]);
3557
3558         if (data[IFLA_VXLAN_TOS])
3559                 conf->tos  = nla_get_u8(data[IFLA_VXLAN_TOS]);
3560
3561         if (data[IFLA_VXLAN_TTL])
3562                 conf->ttl = nla_get_u8(data[IFLA_VXLAN_TTL]);
3563
3564         if (data[IFLA_VXLAN_TTL_INHERIT]) {
3565                 if (changelink)
3566                         return -EOPNOTSUPP;
3567                 conf->flags |= VXLAN_F_TTL_INHERIT;
3568         }
3569
3570         if (data[IFLA_VXLAN_LABEL])
3571                 conf->label = nla_get_be32(data[IFLA_VXLAN_LABEL]) &
3572                              IPV6_FLOWLABEL_MASK;
3573
3574         if (data[IFLA_VXLAN_LEARNING]) {
3575                 if (nla_get_u8(data[IFLA_VXLAN_LEARNING]))
3576                         conf->flags |= VXLAN_F_LEARN;
3577                 else
3578                         conf->flags &= ~VXLAN_F_LEARN;
3579         } else if (!changelink) {
3580                 /* default to learn on a new device */
3581                 conf->flags |= VXLAN_F_LEARN;
3582         }
3583
3584         if (data[IFLA_VXLAN_AGEING])
3585                 conf->age_interval = nla_get_u32(data[IFLA_VXLAN_AGEING]);
3586
3587         if (data[IFLA_VXLAN_PROXY]) {
3588                 if (changelink)
3589                         return -EOPNOTSUPP;
3590                 if (nla_get_u8(data[IFLA_VXLAN_PROXY]))
3591                         conf->flags |= VXLAN_F_PROXY;
3592         }
3593
3594         if (data[IFLA_VXLAN_RSC]) {
3595                 if (changelink)
3596                         return -EOPNOTSUPP;
3597                 if (nla_get_u8(data[IFLA_VXLAN_RSC]))
3598                         conf->flags |= VXLAN_F_RSC;
3599         }
3600
3601         if (data[IFLA_VXLAN_L2MISS]) {
3602                 if (changelink)
3603                         return -EOPNOTSUPP;
3604                 if (nla_get_u8(data[IFLA_VXLAN_L2MISS]))
3605                         conf->flags |= VXLAN_F_L2MISS;
3606         }
3607
3608         if (data[IFLA_VXLAN_L3MISS]) {
3609                 if (changelink)
3610                         return -EOPNOTSUPP;
3611                 if (nla_get_u8(data[IFLA_VXLAN_L3MISS]))
3612                         conf->flags |= VXLAN_F_L3MISS;
3613         }
3614
3615         if (data[IFLA_VXLAN_LIMIT]) {
3616                 if (changelink)
3617                         return -EOPNOTSUPP;
3618                 conf->addrmax = nla_get_u32(data[IFLA_VXLAN_LIMIT]);
3619         }
3620
3621         if (data[IFLA_VXLAN_COLLECT_METADATA]) {
3622                 if (changelink)
3623                         return -EOPNOTSUPP;
3624                 if (nla_get_u8(data[IFLA_VXLAN_COLLECT_METADATA]))
3625                         conf->flags |= VXLAN_F_COLLECT_METADATA;
3626         }
3627
3628         if (data[IFLA_VXLAN_PORT_RANGE]) {
3629                 if (!changelink) {
3630                         const struct ifla_vxlan_port_range *p
3631                                 = nla_data(data[IFLA_VXLAN_PORT_RANGE]);
3632                         conf->port_min = ntohs(p->low);
3633                         conf->port_max = ntohs(p->high);
3634                 } else {
3635                         return -EOPNOTSUPP;
3636                 }
3637         }
3638
3639         if (data[IFLA_VXLAN_PORT]) {
3640                 if (changelink)
3641                         return -EOPNOTSUPP;
3642                 conf->dst_port = nla_get_be16(data[IFLA_VXLAN_PORT]);
3643         }
3644
3645         if (data[IFLA_VXLAN_UDP_CSUM]) {
3646                 if (changelink)
3647                         return -EOPNOTSUPP;
3648                 if (!nla_get_u8(data[IFLA_VXLAN_UDP_CSUM]))
3649                         conf->flags |= VXLAN_F_UDP_ZERO_CSUM_TX;
3650         }
3651
3652         if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_TX]) {
3653                 if (changelink)
3654                         return -EOPNOTSUPP;
3655                 if (nla_get_u8(data[IFLA_VXLAN_UDP_ZERO_CSUM6_TX]))
3656                         conf->flags |= VXLAN_F_UDP_ZERO_CSUM6_TX;
3657         }
3658
3659         if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_RX]) {
3660                 if (changelink)
3661                         return -EOPNOTSUPP;
3662                 if (nla_get_u8(data[IFLA_VXLAN_UDP_ZERO_CSUM6_RX]))
3663                         conf->flags |= VXLAN_F_UDP_ZERO_CSUM6_RX;
3664         }
3665
3666         if (data[IFLA_VXLAN_REMCSUM_TX]) {
3667                 if (changelink)
3668                         return -EOPNOTSUPP;
3669                 if (nla_get_u8(data[IFLA_VXLAN_REMCSUM_TX]))
3670                         conf->flags |= VXLAN_F_REMCSUM_TX;
3671         }
3672
3673         if (data[IFLA_VXLAN_REMCSUM_RX]) {
3674                 if (changelink)
3675                         return -EOPNOTSUPP;
3676                 if (nla_get_u8(data[IFLA_VXLAN_REMCSUM_RX]))
3677                         conf->flags |= VXLAN_F_REMCSUM_RX;
3678         }
3679
3680         if (data[IFLA_VXLAN_GBP]) {
3681                 if (changelink)
3682                         return -EOPNOTSUPP;
3683                 conf->flags |= VXLAN_F_GBP;
3684         }
3685
3686         if (data[IFLA_VXLAN_GPE]) {
3687                 if (changelink)
3688                         return -EOPNOTSUPP;
3689                 conf->flags |= VXLAN_F_GPE;
3690         }
3691
3692         if (data[IFLA_VXLAN_REMCSUM_NOPARTIAL]) {
3693                 if (changelink)
3694                         return -EOPNOTSUPP;
3695                 conf->flags |= VXLAN_F_REMCSUM_NOPARTIAL;
3696         }
3697
3698         if (tb[IFLA_MTU]) {
3699                 if (changelink)
3700                         return -EOPNOTSUPP;
3701                 conf->mtu = nla_get_u32(tb[IFLA_MTU]);
3702         }
3703
3704         if (data[IFLA_VXLAN_DF])
3705                 conf->df = nla_get_u8(data[IFLA_VXLAN_DF]);
3706
3707         return 0;
3708 }
3709
3710 static int vxlan_newlink(struct net *src_net, struct net_device *dev,
3711                          struct nlattr *tb[], struct nlattr *data[],
3712                          struct netlink_ext_ack *extack)
3713 {
3714         struct vxlan_config conf;
3715         int err;
3716
3717         err = vxlan_nl2conf(tb, data, dev, &conf, false);
3718         if (err)
3719                 return err;
3720
3721         return __vxlan_dev_create(src_net, dev, &conf, extack);
3722 }
3723
3724 static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[],
3725                             struct nlattr *data[],
3726                             struct netlink_ext_ack *extack)
3727 {
3728         struct vxlan_dev *vxlan = netdev_priv(dev);
3729         struct vxlan_rdst *dst = &vxlan->default_dst;
3730         unsigned long old_age_interval;
3731         struct vxlan_rdst old_dst;
3732         struct vxlan_config conf;
3733         int err;
3734
3735         err = vxlan_nl2conf(tb, data,
3736                             dev, &conf, true);
3737         if (err)
3738                 return err;
3739
3740         old_age_interval = vxlan->cfg.age_interval;
3741         memcpy(&old_dst, dst, sizeof(struct vxlan_rdst));
3742
3743         err = vxlan_dev_configure(vxlan->net, dev, &conf, true, extack);
3744         if (err)
3745                 return err;
3746
3747         if (old_age_interval != vxlan->cfg.age_interval)
3748                 mod_timer(&vxlan->age_timer, jiffies);
3749
3750         /* handle default dst entry */
3751         if (!vxlan_addr_equal(&dst->remote_ip, &old_dst.remote_ip)) {
3752                 spin_lock_bh(&vxlan->hash_lock);
3753                 if (!vxlan_addr_any(&old_dst.remote_ip))
3754                         __vxlan_fdb_delete(vxlan, all_zeros_mac,
3755                                            old_dst.remote_ip,
3756                                            vxlan->cfg.dst_port,
3757                                            old_dst.remote_vni,
3758                                            old_dst.remote_vni,
3759                                            old_dst.remote_ifindex,
3760                                            true);
3761
3762                 if (!vxlan_addr_any(&dst->remote_ip)) {
3763                         err = vxlan_fdb_update(vxlan, all_zeros_mac,
3764                                                &dst->remote_ip,
3765                                                NUD_REACHABLE | NUD_PERMANENT,
3766                                                NLM_F_APPEND | NLM_F_CREATE,
3767                                                vxlan->cfg.dst_port,
3768                                                dst->remote_vni,
3769                                                dst->remote_vni,
3770                                                dst->remote_ifindex,
3771                                                NTF_SELF, true);
3772                         if (err) {
3773                                 spin_unlock_bh(&vxlan->hash_lock);
3774                                 return err;
3775                         }
3776                 }
3777                 spin_unlock_bh(&vxlan->hash_lock);
3778         }
3779
3780         return 0;
3781 }
3782
3783 static void vxlan_dellink(struct net_device *dev, struct list_head *head)
3784 {
3785         struct vxlan_dev *vxlan = netdev_priv(dev);
3786
3787         vxlan_flush(vxlan, true);
3788
3789         gro_cells_destroy(&vxlan->gro_cells);
3790         list_del(&vxlan->next);
3791         unregister_netdevice_queue(dev, head);
3792 }
3793
3794 static size_t vxlan_get_size(const struct net_device *dev)
3795 {
3796
3797         return nla_total_size(sizeof(__u32)) +  /* IFLA_VXLAN_ID */
3798                 nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_GROUP{6} */
3799                 nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LINK */
3800                 nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_LOCAL{6} */
3801                 nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_TTL */
3802                 nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_TTL_INHERIT */
3803                 nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_TOS */
3804                 nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_DF */
3805                 nla_total_size(sizeof(__be32)) + /* IFLA_VXLAN_LABEL */
3806                 nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_LEARNING */
3807                 nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_PROXY */
3808                 nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_RSC */
3809                 nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_L2MISS */
3810                 nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_L3MISS */
3811                 nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_COLLECT_METADATA */
3812                 nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_AGEING */
3813                 nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LIMIT */
3814                 nla_total_size(sizeof(struct ifla_vxlan_port_range)) +
3815                 nla_total_size(sizeof(__be16)) + /* IFLA_VXLAN_PORT */
3816                 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_CSUM */
3817                 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_TX */
3818                 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_RX */
3819                 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_TX */
3820                 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_RX */
3821                 0;
3822 }
3823
3824 static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
3825 {
3826         const struct vxlan_dev *vxlan = netdev_priv(dev);
3827         const struct vxlan_rdst *dst = &vxlan->default_dst;
3828         struct ifla_vxlan_port_range ports = {
3829                 .low =  htons(vxlan->cfg.port_min),
3830                 .high = htons(vxlan->cfg.port_max),
3831         };
3832
3833         if (nla_put_u32(skb, IFLA_VXLAN_ID, be32_to_cpu(dst->remote_vni)))
3834                 goto nla_put_failure;
3835
3836         if (!vxlan_addr_any(&dst->remote_ip)) {
3837                 if (dst->remote_ip.sa.sa_family == AF_INET) {
3838                         if (nla_put_in_addr(skb, IFLA_VXLAN_GROUP,
3839                                             dst->remote_ip.sin.sin_addr.s_addr))
3840                                 goto nla_put_failure;
3841 #if IS_ENABLED(CONFIG_IPV6)
3842                 } else {
3843                         if (nla_put_in6_addr(skb, IFLA_VXLAN_GROUP6,
3844                                              &dst->remote_ip.sin6.sin6_addr))
3845                                 goto nla_put_failure;
3846 #endif
3847                 }
3848         }
3849
3850         if (dst->remote_ifindex && nla_put_u32(skb, IFLA_VXLAN_LINK, dst->remote_ifindex))
3851                 goto nla_put_failure;
3852
3853         if (!vxlan_addr_any(&vxlan->cfg.saddr)) {
3854                 if (vxlan->cfg.saddr.sa.sa_family == AF_INET) {
3855                         if (nla_put_in_addr(skb, IFLA_VXLAN_LOCAL,
3856                                             vxlan->cfg.saddr.sin.sin_addr.s_addr))
3857                                 goto nla_put_failure;
3858 #if IS_ENABLED(CONFIG_IPV6)
3859                 } else {
3860                         if (nla_put_in6_addr(skb, IFLA_VXLAN_LOCAL6,
3861                                              &vxlan->cfg.saddr.sin6.sin6_addr))
3862                                 goto nla_put_failure;
3863 #endif
3864                 }
3865         }
3866
3867         if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->cfg.ttl) ||
3868             nla_put_u8(skb, IFLA_VXLAN_TTL_INHERIT,
3869                        !!(vxlan->cfg.flags & VXLAN_F_TTL_INHERIT)) ||
3870             nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->cfg.tos) ||
3871             nla_put_u8(skb, IFLA_VXLAN_DF, vxlan->cfg.df) ||
3872             nla_put_be32(skb, IFLA_VXLAN_LABEL, vxlan->cfg.label) ||
3873             nla_put_u8(skb, IFLA_VXLAN_LEARNING,
3874                         !!(vxlan->cfg.flags & VXLAN_F_LEARN)) ||
3875             nla_put_u8(skb, IFLA_VXLAN_PROXY,
3876                         !!(vxlan->cfg.flags & VXLAN_F_PROXY)) ||
3877             nla_put_u8(skb, IFLA_VXLAN_RSC,
3878                        !!(vxlan->cfg.flags & VXLAN_F_RSC)) ||
3879             nla_put_u8(skb, IFLA_VXLAN_L2MISS,
3880                         !!(vxlan->cfg.flags & VXLAN_F_L2MISS)) ||
3881             nla_put_u8(skb, IFLA_VXLAN_L3MISS,
3882                         !!(vxlan->cfg.flags & VXLAN_F_L3MISS)) ||
3883             nla_put_u8(skb, IFLA_VXLAN_COLLECT_METADATA,
3884                        !!(vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA)) ||
3885             nla_put_u32(skb, IFLA_VXLAN_AGEING, vxlan->cfg.age_interval) ||
3886             nla_put_u32(skb, IFLA_VXLAN_LIMIT, vxlan->cfg.addrmax) ||
3887             nla_put_be16(skb, IFLA_VXLAN_PORT, vxlan->cfg.dst_port) ||
3888             nla_put_u8(skb, IFLA_VXLAN_UDP_CSUM,
3889                         !(vxlan->cfg.flags & VXLAN_F_UDP_ZERO_CSUM_TX)) ||
3890             nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_TX,
3891                         !!(vxlan->cfg.flags & VXLAN_F_UDP_ZERO_CSUM6_TX)) ||
3892             nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_RX,
3893                         !!(vxlan->cfg.flags & VXLAN_F_UDP_ZERO_CSUM6_RX)) ||
3894             nla_put_u8(skb, IFLA_VXLAN_REMCSUM_TX,
3895                         !!(vxlan->cfg.flags & VXLAN_F_REMCSUM_TX)) ||
3896             nla_put_u8(skb, IFLA_VXLAN_REMCSUM_RX,
3897                         !!(vxlan->cfg.flags & VXLAN_F_REMCSUM_RX)))
3898                 goto nla_put_failure;
3899
3900         if (nla_put(skb, IFLA_VXLAN_PORT_RANGE, sizeof(ports), &ports))
3901                 goto nla_put_failure;
3902
3903         if (vxlan->cfg.flags & VXLAN_F_GBP &&
3904             nla_put_flag(skb, IFLA_VXLAN_GBP))
3905                 goto nla_put_failure;
3906
3907         if (vxlan->cfg.flags & VXLAN_F_GPE &&
3908             nla_put_flag(skb, IFLA_VXLAN_GPE))
3909                 goto nla_put_failure;
3910
3911         if (vxlan->cfg.flags & VXLAN_F_REMCSUM_NOPARTIAL &&
3912             nla_put_flag(skb, IFLA_VXLAN_REMCSUM_NOPARTIAL))
3913                 goto nla_put_failure;
3914
3915         return 0;
3916
3917 nla_put_failure:
3918         return -EMSGSIZE;
3919 }
3920
3921 static struct net *vxlan_get_link_net(const struct net_device *dev)
3922 {
3923         struct vxlan_dev *vxlan = netdev_priv(dev);
3924
3925         return vxlan->net;
3926 }
3927
3928 static struct rtnl_link_ops vxlan_link_ops __read_mostly = {
3929         .kind           = "vxlan",
3930         .maxtype        = IFLA_VXLAN_MAX,
3931         .policy         = vxlan_policy,
3932         .priv_size      = sizeof(struct vxlan_dev),
3933         .setup          = vxlan_setup,
3934         .validate       = vxlan_validate,
3935         .newlink        = vxlan_newlink,
3936         .changelink     = vxlan_changelink,
3937         .dellink        = vxlan_dellink,
3938         .get_size       = vxlan_get_size,
3939         .fill_info      = vxlan_fill_info,
3940         .get_link_net   = vxlan_get_link_net,
3941 };
3942
3943 struct net_device *vxlan_dev_create(struct net *net, const char *name,
3944                                     u8 name_assign_type,
3945                                     struct vxlan_config *conf)
3946 {
3947         struct nlattr *tb[IFLA_MAX + 1];
3948         struct net_device *dev;
3949         int err;
3950
3951         memset(&tb, 0, sizeof(tb));
3952
3953         dev = rtnl_create_link(net, name, name_assign_type,
3954                                &vxlan_link_ops, tb, NULL);
3955         if (IS_ERR(dev))
3956                 return dev;
3957
3958         err = __vxlan_dev_create(net, dev, conf, NULL);
3959         if (err < 0) {
3960                 free_netdev(dev);
3961                 return ERR_PTR(err);
3962         }
3963
3964         err = rtnl_configure_link(dev, NULL);
3965         if (err < 0) {
3966                 LIST_HEAD(list_kill);
3967
3968                 vxlan_dellink(dev, &list_kill);
3969                 unregister_netdevice_many(&list_kill);
3970                 return ERR_PTR(err);
3971         }
3972
3973         return dev;
3974 }
3975 EXPORT_SYMBOL_GPL(vxlan_dev_create);
3976
3977 static void vxlan_handle_lowerdev_unregister(struct vxlan_net *vn,
3978                                              struct net_device *dev)
3979 {
3980         struct vxlan_dev *vxlan, *next;
3981         LIST_HEAD(list_kill);
3982
3983         list_for_each_entry_safe(vxlan, next, &vn->vxlan_list, next) {
3984                 struct vxlan_rdst *dst = &vxlan->default_dst;
3985
3986                 /* In case we created vxlan device with carrier
3987                  * and we loose the carrier due to module unload
3988                  * we also need to remove vxlan device. In other
3989                  * cases, it's not necessary and remote_ifindex
3990                  * is 0 here, so no matches.
3991                  */
3992                 if (dst->remote_ifindex == dev->ifindex)
3993                         vxlan_dellink(vxlan->dev, &list_kill);
3994         }
3995
3996         unregister_netdevice_many(&list_kill);
3997 }
3998
3999 static int vxlan_netdevice_event(struct notifier_block *unused,
4000                                  unsigned long event, void *ptr)
4001 {
4002         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4003         struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
4004
4005         if (event == NETDEV_UNREGISTER) {
4006                 vxlan_offload_rx_ports(dev, false);
4007                 vxlan_handle_lowerdev_unregister(vn, dev);
4008         } else if (event == NETDEV_REGISTER) {
4009                 vxlan_offload_rx_ports(dev, true);
4010         } else if (event == NETDEV_UDP_TUNNEL_PUSH_INFO ||
4011                    event == NETDEV_UDP_TUNNEL_DROP_INFO) {
4012                 vxlan_offload_rx_ports(dev, event == NETDEV_UDP_TUNNEL_PUSH_INFO);
4013         }
4014
4015         return NOTIFY_DONE;
4016 }
4017
4018 static struct notifier_block vxlan_notifier_block __read_mostly = {
4019         .notifier_call = vxlan_netdevice_event,
4020 };
4021
4022 static void
4023 vxlan_fdb_offloaded_set(struct net_device *dev,
4024                         struct switchdev_notifier_vxlan_fdb_info *fdb_info)
4025 {
4026         struct vxlan_dev *vxlan = netdev_priv(dev);
4027         struct vxlan_rdst *rdst;
4028         struct vxlan_fdb *f;
4029
4030         spin_lock_bh(&vxlan->hash_lock);
4031
4032         f = vxlan_find_mac(vxlan, fdb_info->eth_addr, fdb_info->vni);
4033         if (!f)
4034                 goto out;
4035
4036         rdst = vxlan_fdb_find_rdst(f, &fdb_info->remote_ip,
4037                                    fdb_info->remote_port,
4038                                    fdb_info->remote_vni,
4039                                    fdb_info->remote_ifindex);
4040         if (!rdst)
4041                 goto out;
4042
4043         rdst->offloaded = fdb_info->offloaded;
4044
4045 out:
4046         spin_unlock_bh(&vxlan->hash_lock);
4047 }
4048
4049 static int
4050 vxlan_fdb_external_learn_add(struct net_device *dev,
4051                              struct switchdev_notifier_vxlan_fdb_info *fdb_info)
4052 {
4053         struct vxlan_dev *vxlan = netdev_priv(dev);
4054         int err;
4055
4056         spin_lock_bh(&vxlan->hash_lock);
4057         err = vxlan_fdb_update(vxlan, fdb_info->eth_addr, &fdb_info->remote_ip,
4058                                NUD_REACHABLE,
4059                                NLM_F_CREATE | NLM_F_REPLACE,
4060                                fdb_info->remote_port,
4061                                fdb_info->vni,
4062                                fdb_info->remote_vni,
4063                                fdb_info->remote_ifindex,
4064                                NTF_USE | NTF_SELF | NTF_EXT_LEARNED,
4065                                false);
4066         spin_unlock_bh(&vxlan->hash_lock);
4067
4068         return err;
4069 }
4070
4071 static int
4072 vxlan_fdb_external_learn_del(struct net_device *dev,
4073                              struct switchdev_notifier_vxlan_fdb_info *fdb_info)
4074 {
4075         struct vxlan_dev *vxlan = netdev_priv(dev);
4076         struct vxlan_fdb *f;
4077         int err = 0;
4078
4079         spin_lock_bh(&vxlan->hash_lock);
4080
4081         f = vxlan_find_mac(vxlan, fdb_info->eth_addr, fdb_info->vni);
4082         if (!f)
4083                 err = -ENOENT;
4084         else if (f->flags & NTF_EXT_LEARNED)
4085                 err = __vxlan_fdb_delete(vxlan, fdb_info->eth_addr,
4086                                          fdb_info->remote_ip,
4087                                          fdb_info->remote_port,
4088                                          fdb_info->vni,
4089                                          fdb_info->remote_vni,
4090                                          fdb_info->remote_ifindex,
4091                                          false);
4092
4093         spin_unlock_bh(&vxlan->hash_lock);
4094
4095         return err;
4096 }
4097
4098 static int vxlan_switchdev_event(struct notifier_block *unused,
4099                                  unsigned long event, void *ptr)
4100 {
4101         struct net_device *dev = switchdev_notifier_info_to_dev(ptr);
4102         struct switchdev_notifier_vxlan_fdb_info *fdb_info;
4103         int err = 0;
4104
4105         switch (event) {
4106         case SWITCHDEV_VXLAN_FDB_OFFLOADED:
4107                 vxlan_fdb_offloaded_set(dev, ptr);
4108                 break;
4109         case SWITCHDEV_VXLAN_FDB_ADD_TO_BRIDGE:
4110                 fdb_info = ptr;
4111                 err = vxlan_fdb_external_learn_add(dev, fdb_info);
4112                 if (err) {
4113                         err = notifier_from_errno(err);
4114                         break;
4115                 }
4116                 fdb_info->offloaded = true;
4117                 vxlan_fdb_offloaded_set(dev, fdb_info);
4118                 break;
4119         case SWITCHDEV_VXLAN_FDB_DEL_TO_BRIDGE:
4120                 fdb_info = ptr;
4121                 err = vxlan_fdb_external_learn_del(dev, fdb_info);
4122                 if (err) {
4123                         err = notifier_from_errno(err);
4124                         break;
4125                 }
4126                 fdb_info->offloaded = false;
4127                 vxlan_fdb_offloaded_set(dev, fdb_info);
4128                 break;
4129         }
4130
4131         return err;
4132 }
4133
4134 static struct notifier_block vxlan_switchdev_notifier_block __read_mostly = {
4135         .notifier_call = vxlan_switchdev_event,
4136 };
4137
4138 static __net_init int vxlan_init_net(struct net *net)
4139 {
4140         struct vxlan_net *vn = net_generic(net, vxlan_net_id);
4141         unsigned int h;
4142
4143         INIT_LIST_HEAD(&vn->vxlan_list);
4144         spin_lock_init(&vn->sock_lock);
4145
4146         for (h = 0; h < PORT_HASH_SIZE; ++h)
4147                 INIT_HLIST_HEAD(&vn->sock_list[h]);
4148
4149         return 0;
4150 }
4151
4152 static void vxlan_destroy_tunnels(struct net *net, struct list_head *head)
4153 {
4154         struct vxlan_net *vn = net_generic(net, vxlan_net_id);
4155         struct vxlan_dev *vxlan, *next;
4156         struct net_device *dev, *aux;
4157         unsigned int h;
4158
4159         for_each_netdev_safe(net, dev, aux)
4160                 if (dev->rtnl_link_ops == &vxlan_link_ops)
4161                         unregister_netdevice_queue(dev, head);
4162
4163         list_for_each_entry_safe(vxlan, next, &vn->vxlan_list, next) {
4164                 /* If vxlan->dev is in the same netns, it has already been added
4165                  * to the list by the previous loop.
4166                  */
4167                 if (!net_eq(dev_net(vxlan->dev), net)) {
4168                         gro_cells_destroy(&vxlan->gro_cells);
4169                         unregister_netdevice_queue(vxlan->dev, head);
4170                 }
4171         }
4172
4173         for (h = 0; h < PORT_HASH_SIZE; ++h)
4174                 WARN_ON_ONCE(!hlist_empty(&vn->sock_list[h]));
4175 }
4176
4177 static void __net_exit vxlan_exit_batch_net(struct list_head *net_list)
4178 {
4179         struct net *net;
4180         LIST_HEAD(list);
4181
4182         rtnl_lock();
4183         list_for_each_entry(net, net_list, exit_list)
4184                 vxlan_destroy_tunnels(net, &list);
4185
4186         unregister_netdevice_many(&list);
4187         rtnl_unlock();
4188 }
4189
4190 static struct pernet_operations vxlan_net_ops = {
4191         .init = vxlan_init_net,
4192         .exit_batch = vxlan_exit_batch_net,
4193         .id   = &vxlan_net_id,
4194         .size = sizeof(struct vxlan_net),
4195 };
4196
4197 static int __init vxlan_init_module(void)
4198 {
4199         int rc;
4200
4201         get_random_bytes(&vxlan_salt, sizeof(vxlan_salt));
4202
4203         rc = register_pernet_subsys(&vxlan_net_ops);
4204         if (rc)
4205                 goto out1;
4206
4207         rc = register_netdevice_notifier(&vxlan_notifier_block);
4208         if (rc)
4209                 goto out2;
4210
4211         rc = register_switchdev_notifier(&vxlan_switchdev_notifier_block);
4212         if (rc)
4213                 goto out3;
4214
4215         rc = rtnl_link_register(&vxlan_link_ops);
4216         if (rc)
4217                 goto out4;
4218
4219         return 0;
4220 out4:
4221         unregister_switchdev_notifier(&vxlan_switchdev_notifier_block);
4222 out3:
4223         unregister_netdevice_notifier(&vxlan_notifier_block);
4224 out2:
4225         unregister_pernet_subsys(&vxlan_net_ops);
4226 out1:
4227         return rc;
4228 }
4229 late_initcall(vxlan_init_module);
4230
4231 static void __exit vxlan_cleanup_module(void)
4232 {
4233         rtnl_link_unregister(&vxlan_link_ops);
4234         unregister_switchdev_notifier(&vxlan_switchdev_notifier_block);
4235         unregister_netdevice_notifier(&vxlan_notifier_block);
4236         unregister_pernet_subsys(&vxlan_net_ops);
4237         /* rcu_barrier() is called by netns */
4238 }
4239 module_exit(vxlan_cleanup_module);
4240
4241 MODULE_LICENSE("GPL");
4242 MODULE_VERSION(VXLAN_VERSION);
4243 MODULE_AUTHOR("Stephen Hemminger <stephen@networkplumber.org>");
4244 MODULE_DESCRIPTION("Driver for VXLAN encapsulated traffic");
4245 MODULE_ALIAS_RTNL_LINK("vxlan");