OSDN Git Service

Merge tag 'clk-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git...
[uclinux-h8/linux.git] / net / ipv4 / ip_tunnel.c
1 /*
2  * Copyright (c) 2013 Nicira, Inc.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of version 2 of the GNU General Public
6  * License as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful, but
9  * WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public License
14  * along with this program; if not, write to the Free Software
15  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16  * 02110-1301, USA
17  */
18
19 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
20
21 #include <linux/capability.h>
22 #include <linux/module.h>
23 #include <linux/types.h>
24 #include <linux/kernel.h>
25 #include <linux/slab.h>
26 #include <linux/uaccess.h>
27 #include <linux/skbuff.h>
28 #include <linux/netdevice.h>
29 #include <linux/in.h>
30 #include <linux/tcp.h>
31 #include <linux/udp.h>
32 #include <linux/if_arp.h>
33 #include <linux/init.h>
34 #include <linux/in6.h>
35 #include <linux/inetdevice.h>
36 #include <linux/igmp.h>
37 #include <linux/netfilter_ipv4.h>
38 #include <linux/etherdevice.h>
39 #include <linux/if_ether.h>
40 #include <linux/if_vlan.h>
41 #include <linux/rculist.h>
42 #include <linux/err.h>
43
44 #include <net/sock.h>
45 #include <net/ip.h>
46 #include <net/icmp.h>
47 #include <net/protocol.h>
48 #include <net/ip_tunnels.h>
49 #include <net/arp.h>
50 #include <net/checksum.h>
51 #include <net/dsfield.h>
52 #include <net/inet_ecn.h>
53 #include <net/xfrm.h>
54 #include <net/net_namespace.h>
55 #include <net/netns/generic.h>
56 #include <net/rtnetlink.h>
57 #include <net/udp.h>
58 #include <net/dst_metadata.h>
59
60 #if IS_ENABLED(CONFIG_IPV6)
61 #include <net/ipv6.h>
62 #include <net/ip6_fib.h>
63 #include <net/ip6_route.h>
64 #endif
65
66 static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
67 {
68         return hash_32((__force u32)key ^ (__force u32)remote,
69                          IP_TNL_HASH_BITS);
70 }
71
72 static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
73                                 __be16 flags, __be32 key)
74 {
75         if (p->i_flags & TUNNEL_KEY) {
76                 if (flags & TUNNEL_KEY)
77                         return key == p->i_key;
78                 else
79                         /* key expected, none present */
80                         return false;
81         } else
82                 return !(flags & TUNNEL_KEY);
83 }
84
85 /* Fallback tunnel: no source, no destination, no key, no options
86
87    Tunnel hash table:
88    We require exact key match i.e. if a key is present in packet
89    it will match only tunnel with the same key; if it is not present,
90    it will match only keyless tunnel.
91
92    All keyless packets, if not matched by a configured keyless tunnel,
93    will match the fallback tunnel.
94    Given src, dst and key, find the appropriate tunnel for input.
95 */
/* Search itn's hash table for the tunnel matching (link, flags, remote,
 * local, key).
 *
 * Several passes are made from the most specific address match to the
 * least specific one.  In every pass a tunnel whose parms.link equals
 * @link is returned immediately; a tunnel that matches everything except
 * the link is only remembered in 'cand'.  If no pass returns, the result
 * is, in order: 'cand', the collect_md tunnel (if its device is up), the
 * fallback tunnel device's private data (if that device is up), or NULL.
 *
 * The list walks use the _rcu iterator and rcu_dereference().
 */
96 struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
97                                    int link, __be16 flags,
98                                    __be32 remote, __be32 local,
99                                    __be32 key)
100 {
101         unsigned int hash;
102         struct ip_tunnel *t, *cand = NULL;
103         struct hlist_head *head;
104
105         hash = ip_tunnel_hash(key, remote);
106         head = &itn->tunnels[hash];
107
            /* Pass 1: both local and remote addresses match exactly.
             * Note that a link mismatch here overwrites 'cand'
             * unconditionally, unlike the later passes.
             */
108         hlist_for_each_entry_rcu(t, head, hash_node) {
109                 if (local != t->parms.iph.saddr ||
110                     remote != t->parms.iph.daddr ||
111                     !(t->dev->flags & IFF_UP))
112                         continue;
113
114                 if (!ip_tunnel_key_match(&t->parms, flags, key))
115                         continue;
116
117                 if (t->parms.link == link)
118                         return t;
119                 else
120                         cand = t;
121         }
122
            /* Pass 2: same bucket, remote matches and the tunnel has no
             * source address configured (saddr == 0).
             */
123         hlist_for_each_entry_rcu(t, head, hash_node) {
124                 if (remote != t->parms.iph.daddr ||
125                     t->parms.iph.saddr != 0 ||
126                     !(t->dev->flags & IFF_UP))
127                         continue;
128
129                 if (!ip_tunnel_key_match(&t->parms, flags, key))
130                         continue;
131
132                 if (t->parms.link == link)
133                         return t;
134                 else if (!cand)
135                         cand = t;
136         }
137
            /* The remaining passes use the bucket hashed with remote == 0. */
138         hash = ip_tunnel_hash(key, 0);
139         head = &itn->tunnels[hash];
140
            /* Pass 3: keep a tunnel if either (saddr == local and daddr == 0)
             * or (daddr == local and local is a multicast address).
             */
141         hlist_for_each_entry_rcu(t, head, hash_node) {
142                 if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
143                     (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
144                         continue;
145
146                 if (!(t->dev->flags & IFF_UP))
147                         continue;
148
149                 if (!ip_tunnel_key_match(&t->parms, flags, key))
150                         continue;
151
152                 if (t->parms.link == link)
153                         return t;
154                 else if (!cand)
155                         cand = t;
156         }
157
            /* Pass 4 is skipped entirely when the caller passes TUNNEL_NO_KEY. */
158         if (flags & TUNNEL_NO_KEY)
159                 goto skip_key_lookup;
160
            /* Pass 4: tunnels with neither address set, matched on i_key
             * alone (ip_tunnel_key_match() is not consulted here).
             */
161         hlist_for_each_entry_rcu(t, head, hash_node) {
162                 if (t->parms.i_key != key ||
163                     t->parms.iph.saddr != 0 ||
164                     t->parms.iph.daddr != 0 ||
165                     !(t->dev->flags & IFF_UP))
166                         continue;
167
168                 if (t->parms.link == link)
169                         return t;
170                 else if (!cand)
171                         cand = t;
172         }
173
174 skip_key_lookup:
            /* No exact-link hit: fall back to the best link-mismatched match. */
175         if (cand)
176                 return cand;
177
            /* Next choice: the collect_md tunnel, if one is set and up. */
178         t = rcu_dereference(itn->collect_md_tun);
179         if (t && t->dev->flags & IFF_UP)
180                 return t;
181
            /* Last resort: the fallback tunnel device, if present and up. */
182         if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
183                 return netdev_priv(itn->fb_tunnel_dev);
184
185         return NULL;
186 }
187 EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
188
189 static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
190                                     struct ip_tunnel_parm *parms)
191 {
192         unsigned int h;
193         __be32 remote;
194         __be32 i_key = parms->i_key;
195
196         if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
197                 remote = parms->iph.daddr;
198         else
199                 remote = 0;
200
201         if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
202                 i_key = 0;
203
204         h = ip_tunnel_hash(i_key, remote);
205         return &itn->tunnels[h];
206 }
207
208 static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
209 {
210         struct hlist_head *head = ip_bucket(itn, &t->parms);
211
212         if (t->collect_md)
213                 rcu_assign_pointer(itn->collect_md_tun, t);
214         hlist_add_head_rcu(&t->hash_node, head);
215 }
216
217 static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
218 {
219         if (t->collect_md)
220                 rcu_assign_pointer(itn->collect_md_tun, NULL);
221         hlist_del_init_rcu(&t->hash_node);
222 }
223
224 static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
225                                         struct ip_tunnel_parm *parms,
226                                         int type)
227 {
228         __be32 remote = parms->iph.daddr;
229         __be32 local = parms->iph.saddr;
230         __be32 key = parms->i_key;
231         __be16 flags = parms->i_flags;
232         int link = parms->link;
233         struct ip_tunnel *t = NULL;
234         struct hlist_head *head = ip_bucket(itn, parms);
235
236         hlist_for_each_entry_rcu(t, head, hash_node) {
237                 if (local == t->parms.iph.saddr &&
238                     remote == t->parms.iph.daddr &&
239                     link == t->parms.link &&
240                     type == t->dev->type &&
241                     ip_tunnel_key_match(&t->parms, flags, key))
242                         break;
243         }
244         return t;
245 }
246
247 static struct net_device *__ip_tunnel_create(struct net *net,
248                                              const struct rtnl_link_ops *ops,
249                                              struct ip_tunnel_parm *parms)
250 {
251         int err;
252         struct ip_tunnel *tunnel;
253         struct net_device *dev;
254         char name[IFNAMSIZ];
255
256         err = -E2BIG;
257         if (parms->name[0]) {
258                 if (!dev_valid_name(parms->name))
259                         goto failed;
260                 strlcpy(name, parms->name, IFNAMSIZ);
261         } else {
262                 if (strlen(ops->kind) > (IFNAMSIZ - 3))
263                         goto failed;
264                 strcpy(name, ops->kind);
265                 strcat(name, "%d");
266         }
267
268         ASSERT_RTNL();
269         dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
270         if (!dev) {
271                 err = -ENOMEM;
272                 goto failed;
273         }
274         dev_net_set(dev, net);
275
276         dev->rtnl_link_ops = ops;
277
278         tunnel = netdev_priv(dev);
279         tunnel->parms = *parms;
280         tunnel->net = net;
281
282         err = register_netdevice(dev);
283         if (err)
284                 goto failed_free;
285
286         return dev;
287
288 failed_free:
289         free_netdev(dev);
290 failed:
291         return ERR_PTR(err);
292 }
293
294 static int ip_tunnel_bind_dev(struct net_device *dev)
295 {
296         struct net_device *tdev = NULL;
297         struct ip_tunnel *tunnel = netdev_priv(dev);
298         const struct iphdr *iph;
299         int hlen = LL_MAX_HEADER;
300         int mtu = ETH_DATA_LEN;
301         int t_hlen = tunnel->hlen + sizeof(struct iphdr);
302
303         iph = &tunnel->parms.iph;
304
305         /* Guess output device to choose reasonable mtu and needed_headroom */
306         if (iph->daddr) {
307                 struct flowi4 fl4;
308                 struct rtable *rt;
309
310                 ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
311                                     iph->saddr, tunnel->parms.o_key,
312                                     RT_TOS(iph->tos), tunnel->parms.link,
313                                     tunnel->fwmark);
314                 rt = ip_route_output_key(tunnel->net, &fl4);
315
316                 if (!IS_ERR(rt)) {
317                         tdev = rt->dst.dev;
318                         ip_rt_put(rt);
319                 }
320                 if (dev->type != ARPHRD_ETHER)
321                         dev->flags |= IFF_POINTOPOINT;
322
323                 dst_cache_reset(&tunnel->dst_cache);
324         }
325
326         if (!tdev && tunnel->parms.link)
327                 tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);
328
329         if (tdev) {
330                 hlen = tdev->hard_header_len + tdev->needed_headroom;
331                 mtu = min(tdev->mtu, IP_MAX_MTU);
332         }
333
334         dev->needed_headroom = t_hlen + hlen;
335         mtu -= (dev->hard_header_len + t_hlen);
336
337         if (mtu < IPV4_MIN_MTU)
338                 mtu = IPV4_MIN_MTU;
339
340         return mtu;
341 }
342
343 static struct ip_tunnel *ip_tunnel_create(struct net *net,
344                                           struct ip_tunnel_net *itn,
345                                           struct ip_tunnel_parm *parms)
346 {
347         struct ip_tunnel *nt;
348         struct net_device *dev;
349         int t_hlen;
350         int mtu;
351         int err;
352
353         dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms);
354         if (IS_ERR(dev))
355                 return ERR_CAST(dev);
356
357         mtu = ip_tunnel_bind_dev(dev);
358         err = dev_set_mtu(dev, mtu);
359         if (err)
360                 goto err_dev_set_mtu;
361
362         nt = netdev_priv(dev);
363         t_hlen = nt->hlen + sizeof(struct iphdr);
364         dev->min_mtu = ETH_MIN_MTU;
365         dev->max_mtu = IP_MAX_MTU - dev->hard_header_len - t_hlen;
366         ip_tunnel_add(itn, nt);
367         return nt;
368
369 err_dev_set_mtu:
370         unregister_netdevice(dev);
371         return ERR_PTR(err);
372 }
373
374 int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
375                   const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
376                   bool log_ecn_error)
377 {
378         struct pcpu_sw_netstats *tstats;
379         const struct iphdr *iph = ip_hdr(skb);
380         int err;
381
382 #ifdef CONFIG_NET_IPGRE_BROADCAST
383         if (ipv4_is_multicast(iph->daddr)) {
384                 tunnel->dev->stats.multicast++;
385                 skb->pkt_type = PACKET_BROADCAST;
386         }
387 #endif
388
389         if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
390              ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
391                 tunnel->dev->stats.rx_crc_errors++;
392                 tunnel->dev->stats.rx_errors++;
393                 goto drop;
394         }
395
396         if (tunnel->parms.i_flags&TUNNEL_SEQ) {
397                 if (!(tpi->flags&TUNNEL_SEQ) ||
398                     (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
399                         tunnel->dev->stats.rx_fifo_errors++;
400                         tunnel->dev->stats.rx_errors++;
401                         goto drop;
402                 }
403                 tunnel->i_seqno = ntohl(tpi->seq) + 1;
404         }
405
406         skb_reset_network_header(skb);
407
408         err = IP_ECN_decapsulate(iph, skb);
409         if (unlikely(err)) {
410                 if (log_ecn_error)
411                         net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
412                                         &iph->saddr, iph->tos);
413                 if (err > 1) {
414                         ++tunnel->dev->stats.rx_frame_errors;
415                         ++tunnel->dev->stats.rx_errors;
416                         goto drop;
417                 }
418         }
419
420         tstats = this_cpu_ptr(tunnel->dev->tstats);
421         u64_stats_update_begin(&tstats->syncp);
422         tstats->rx_packets++;
423         tstats->rx_bytes += skb->len;
424         u64_stats_update_end(&tstats->syncp);
425
426         skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
427
428         if (tunnel->dev->type == ARPHRD_ETHER) {
429                 skb->protocol = eth_type_trans(skb, tunnel->dev);
430                 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
431         } else {
432                 skb->dev = tunnel->dev;
433         }
434
435         if (tun_dst)
436                 skb_dst_set(skb, (struct dst_entry *)tun_dst);
437
438         gro_cells_receive(&tunnel->gro_cells, skb);
439         return 0;
440
441 drop:
442         if (tun_dst)
443                 dst_release((struct dst_entry *)tun_dst);
444         kfree_skb(skb);
445         return 0;
446 }
447 EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
448
449 int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
450                             unsigned int num)
451 {
452         if (num >= MAX_IPTUN_ENCAP_OPS)
453                 return -ERANGE;
454
455         return !cmpxchg((const struct ip_tunnel_encap_ops **)
456                         &iptun_encaps[num],
457                         NULL, ops) ? 0 : -1;
458 }
459 EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
460
461 int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
462                             unsigned int num)
463 {
464         int ret;
465
466         if (num >= MAX_IPTUN_ENCAP_OPS)
467                 return -ERANGE;
468
469         ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
470                        &iptun_encaps[num],
471                        ops, NULL) == ops) ? 0 : -1;
472
473         synchronize_net();
474
475         return ret;
476 }
477 EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
478
479 int ip_tunnel_encap_setup(struct ip_tunnel *t,
480                           struct ip_tunnel_encap *ipencap)
481 {
482         int hlen;
483
484         memset(&t->encap, 0, sizeof(t->encap));
485
486         hlen = ip_encap_hlen(ipencap);
487         if (hlen < 0)
488                 return hlen;
489
490         t->encap.type = ipencap->type;
491         t->encap.sport = ipencap->sport;
492         t->encap.dport = ipencap->dport;
493         t->encap.flags = ipencap->flags;
494
495         t->encap_hlen = hlen;
496         t->hlen = t->encap_hlen + t->tun_hlen;
497
498         return 0;
499 }
500 EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
501
502 static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
503                             struct rtable *rt, __be16 df,
504                             const struct iphdr *inner_iph)
505 {
506         struct ip_tunnel *tunnel = netdev_priv(dev);
507         int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
508         int mtu;
509
510         if (df)
511                 mtu = dst_mtu(&rt->dst) - dev->hard_header_len
512                                         - sizeof(struct iphdr) - tunnel->hlen;
513         else
514                 mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
515
516         skb_dst_update_pmtu(skb, mtu);
517
518         if (skb->protocol == htons(ETH_P_IP)) {
519                 if (!skb_is_gso(skb) &&
520                     (inner_iph->frag_off & htons(IP_DF)) &&
521                     mtu < pkt_size) {
522                         memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
523                         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
524                         return -E2BIG;
525                 }
526         }
527 #if IS_ENABLED(CONFIG_IPV6)
528         else if (skb->protocol == htons(ETH_P_IPV6)) {
529                 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
530
531                 if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
532                            mtu >= IPV6_MIN_MTU) {
533                         if ((tunnel->parms.iph.daddr &&
534                             !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
535                             rt6->rt6i_dst.plen == 128) {
536                                 rt6->rt6i_flags |= RTF_MODIFIED;
537                                 dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
538                         }
539                 }
540
541                 if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
542                                         mtu < pkt_size) {
543                         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
544                         return -E2BIG;
545                 }
546         }
547 #endif
548         return 0;
549 }
550
551 void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto)
552 {
553         struct ip_tunnel *tunnel = netdev_priv(dev);
554         u32 headroom = sizeof(struct iphdr);
555         struct ip_tunnel_info *tun_info;
556         const struct ip_tunnel_key *key;
557         const struct iphdr *inner_iph;
558         struct rtable *rt;
559         struct flowi4 fl4;
560         __be16 df = 0;
561         u8 tos, ttl;
562
563         tun_info = skb_tunnel_info(skb);
564         if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
565                      ip_tunnel_info_af(tun_info) != AF_INET))
566                 goto tx_error;
567         key = &tun_info->key;
568         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
569         inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
570         tos = key->tos;
571         if (tos == 1) {
572                 if (skb->protocol == htons(ETH_P_IP))
573                         tos = inner_iph->tos;
574                 else if (skb->protocol == htons(ETH_P_IPV6))
575                         tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
576         }
577         ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, 0,
578                             RT_TOS(tos), tunnel->parms.link, tunnel->fwmark);
579         if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
580                 goto tx_error;
581         rt = ip_route_output_key(tunnel->net, &fl4);
582         if (IS_ERR(rt)) {
583                 dev->stats.tx_carrier_errors++;
584                 goto tx_error;
585         }
586         if (rt->dst.dev == dev) {
587                 ip_rt_put(rt);
588                 dev->stats.collisions++;
589                 goto tx_error;
590         }
591         tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
592         ttl = key->ttl;
593         if (ttl == 0) {
594                 if (skb->protocol == htons(ETH_P_IP))
595                         ttl = inner_iph->ttl;
596                 else if (skb->protocol == htons(ETH_P_IPV6))
597                         ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
598                 else
599                         ttl = ip4_dst_hoplimit(&rt->dst);
600         }
601         if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
602                 df = htons(IP_DF);
603         else if (skb->protocol == htons(ETH_P_IP))
604                 df = inner_iph->frag_off & htons(IP_DF);
605         headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
606         if (headroom > dev->needed_headroom)
607                 dev->needed_headroom = headroom;
608
609         if (skb_cow_head(skb, dev->needed_headroom)) {
610                 ip_rt_put(rt);
611                 goto tx_dropped;
612         }
613         iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
614                       df, !net_eq(tunnel->net, dev_net(dev)));
615         return;
616 tx_error:
617         dev->stats.tx_errors++;
618         goto kfree;
619 tx_dropped:
620         dev->stats.tx_dropped++;
621 kfree:
622         kfree_skb(skb);
623 }
624 EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);
625
/* Encapsulate @skb and transmit it through tunnel device @dev.
 *
 * @tnl_params supplies the outer IPv4 header template (daddr, saddr, tos,
 * ttl, frag_off) and @protocol the outer protocol number.  The skb is
 * consumed on every path: it is either passed to iptunnel_xmit() or freed
 * after the relevant tx error/drop counter on @dev has been incremented.
 */
626 void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
627                     const struct iphdr *tnl_params, u8 protocol)
628 {
629         struct ip_tunnel *tunnel = netdev_priv(dev);
630         const struct iphdr *inner_iph;
631         struct flowi4 fl4;
632         u8     tos, ttl;
633         __be16 df;
634         struct rtable *rt;              /* Route to the other host */
635         unsigned int max_headroom;      /* The extra header space needed */
636         __be32 dst;
637         bool connected;
638
639         inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
            /* 'connected' gates use of the tunnel's dst cache further down;
             * it starts true only when the tunnel has a fixed destination.
             */
640         connected = (tunnel->parms.iph.daddr != 0);
641
642         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
643
644         dst = tnl_params->daddr;
645         if (dst == 0) {
646                 /* NBMA tunnel */
                    /* No destination in the template: derive one from the
                     * packet (tunnel metadata, IPv4 next hop, or an
                     * IPv4-compatible IPv6 address).
                     */
647                 struct ip_tunnel_info *tun_info;
648
649                 if (!skb_dst(skb)) {
650                         dev->stats.tx_fifo_errors++;
651                         goto tx_error;
652                 }
653
654                 tun_info = skb_tunnel_info(skb);
655                 if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) &&
656                     ip_tunnel_info_af(tun_info) == AF_INET &&
657                     tun_info->key.u.ipv4.dst)
658                         dst = tun_info->key.u.ipv4.dst;
659                 else if (skb->protocol == htons(ETH_P_IP)) {
660                         rt = skb_rtable(skb);
661                         dst = rt_nexthop(rt, inner_iph->daddr);
662                 }
663 #if IS_ENABLED(CONFIG_IPV6)
664                 else if (skb->protocol == htons(ETH_P_IPV6)) {
665                         const struct in6_addr *addr6;
666                         struct neighbour *neigh;
667                         bool do_tx_error_icmp;
668                         int addr_type;
669
670                         neigh = dst_neigh_lookup(skb_dst(skb),
671                                                  &ipv6_hdr(skb)->daddr);
672                         if (!neigh)
673                                 goto tx_error;
674
675                         addr6 = (const struct in6_addr *)&neigh->primary_key;
676                         addr_type = ipv6_addr_type(addr6);
677
                            /* Neighbour key is the unspecified address:
                             * use the packet's own destination instead.
                             */
678                         if (addr_type == IPV6_ADDR_ANY) {
679                                 addr6 = &ipv6_hdr(skb)->daddr;
680                                 addr_type = ipv6_addr_type(addr6);
681                         }
682
                            /* The decision is stored in a flag so that the
                             * neighbour reference is released before any
                             * jump to the error path.
                             */
683                         if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
684                                 do_tx_error_icmp = true;
685                         else {
686                                 do_tx_error_icmp = false;
687                                 dst = addr6->s6_addr32[3];
688                         }
689                         neigh_release(neigh);
690                         if (do_tx_error_icmp)
691                                 goto tx_error_icmp;
692                 }
693 #endif
694                 else
695                         goto tx_error;
696
                    /* Per-packet destination: do not use the dst cache. */
697                 connected = false;
698         }
699
700         tos = tnl_params->tos;
            /* Low bit set in the template TOS means "inherit from the inner
             * header"; doing so also disables the dst cache for this packet.
             */
701         if (tos & 0x1) {
702                 tos &= ~0x1;
703                 if (skb->protocol == htons(ETH_P_IP)) {
704                         tos = inner_iph->tos;
705                         connected = false;
706                 } else if (skb->protocol == htons(ETH_P_IPV6)) {
707                         tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
708                         connected = false;
709                 }
710         }
711
712         ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
713                             tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link,
714                             tunnel->fwmark);
715
716         if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
717                 goto tx_error;
718
            /* Try the cached route first when the tunnel is 'connected'. */
719         rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr) :
720                          NULL;
721
722         if (!rt) {
723                 rt = ip_route_output_key(tunnel->net, &fl4);
724
725                 if (IS_ERR(rt)) {
726                         dev->stats.tx_carrier_errors++;
727                         goto tx_error;
728                 }
729                 if (connected)
730                         dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
731                                           fl4.saddr);
732         }
733
            /* The route leads back to this tunnel device: refuse to send. */
734         if (rt->dst.dev == dev) {
735                 ip_rt_put(rt);
736                 dev->stats.collisions++;
737                 goto tx_error;
738         }
739
740         if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph)) {
741                 ip_rt_put(rt);
742                 goto tx_error;
743         }
744
            /* While errors recorded within the last IPTUNNEL_ERR_TIMEO are
             * outstanding, report a link failure for each packet sent and
             * count them down; once the window has passed, clear the count.
             */
745         if (tunnel->err_count > 0) {
746                 if (time_before(jiffies,
747                                 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
748                         tunnel->err_count--;
749
750                         dst_link_failure(skb);
751                 } else
752                         tunnel->err_count = 0;
753         }
754
755         tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
756         ttl = tnl_params->ttl;
            /* Template TTL of 0: inherit from the inner header, or use the
             * route's hop limit for other protocols.
             */
757         if (ttl == 0) {
758                 if (skb->protocol == htons(ETH_P_IP))
759                         ttl = inner_iph->ttl;
760 #if IS_ENABLED(CONFIG_IPV6)
761                 else if (skb->protocol == htons(ETH_P_IPV6))
762                         ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
763 #endif
764                 else
765                         ttl = ip4_dst_hoplimit(&rt->dst);
766         }
767
768         df = tnl_params->frag_off;
            /* Propagate the inner IPv4 DF bit unless ignore_df is set. */
769         if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
770                 df |= (inner_iph->frag_off&htons(IP_DF));
771
772         max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
773                         + rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
            /* needed_headroom only ever grows here. */
774         if (max_headroom > dev->needed_headroom)
775                 dev->needed_headroom = max_headroom;
776
777         if (skb_cow_head(skb, dev->needed_headroom)) {
778                 ip_rt_put(rt);
779                 dev->stats.tx_dropped++;
780                 kfree_skb(skb);
781                 return;
782         }
783
784         iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
785                       df, !net_eq(tunnel->net, dev_net(dev)));
786         return;
787
788 #if IS_ENABLED(CONFIG_IPV6)
789 tx_error_icmp:
790         dst_link_failure(skb);
791 #endif
792 tx_error:
793         dev->stats.tx_errors++;
794         kfree_skb(skb);
795 }
796 EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
797
798 static void ip_tunnel_update(struct ip_tunnel_net *itn,
799                              struct ip_tunnel *t,
800                              struct net_device *dev,
801                              struct ip_tunnel_parm *p,
802                              bool set_mtu,
803                              __u32 fwmark)
804 {
805         ip_tunnel_del(itn, t);
806         t->parms.iph.saddr = p->iph.saddr;
807         t->parms.iph.daddr = p->iph.daddr;
808         t->parms.i_key = p->i_key;
809         t->parms.o_key = p->o_key;
810         if (dev->type != ARPHRD_ETHER) {
811                 memcpy(dev->dev_addr, &p->iph.saddr, 4);
812                 memcpy(dev->broadcast, &p->iph.daddr, 4);
813         }
814         ip_tunnel_add(itn, t);
815
816         t->parms.iph.ttl = p->iph.ttl;
817         t->parms.iph.tos = p->iph.tos;
818         t->parms.iph.frag_off = p->iph.frag_off;
819
820         if (t->parms.link != p->link || t->fwmark != fwmark) {
821                 int mtu;
822
823                 t->parms.link = p->link;
824                 t->fwmark = fwmark;
825                 mtu = ip_tunnel_bind_dev(dev);
826                 if (set_mtu)
827                         dev->mtu = mtu;
828         }
829         dst_cache_reset(&t->dst_cache);
830         netdev_state_change(dev);
831 }
832
833 int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
834 {
835         int err = 0;
836         struct ip_tunnel *t = netdev_priv(dev);
837         struct net *net = t->net;
838         struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);
839
840         switch (cmd) {
841         case SIOCGETTUNNEL:
842                 if (dev == itn->fb_tunnel_dev) {
843                         t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
844                         if (!t)
845                                 t = netdev_priv(dev);
846                 }
847                 memcpy(p, &t->parms, sizeof(*p));
848                 break;
849
850         case SIOCADDTUNNEL:
851         case SIOCCHGTUNNEL:
852                 err = -EPERM;
853                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
854                         goto done;
855                 if (p->iph.ttl)
856                         p->iph.frag_off |= htons(IP_DF);
857                 if (!(p->i_flags & VTI_ISVTI)) {
858                         if (!(p->i_flags & TUNNEL_KEY))
859                                 p->i_key = 0;
860                         if (!(p->o_flags & TUNNEL_KEY))
861                                 p->o_key = 0;
862                 }
863
864                 t = ip_tunnel_find(itn, p, itn->type);
865
866                 if (cmd == SIOCADDTUNNEL) {
867                         if (!t) {
868                                 t = ip_tunnel_create(net, itn, p);
869                                 err = PTR_ERR_OR_ZERO(t);
870                                 break;
871                         }
872
873                         err = -EEXIST;
874                         break;
875                 }
876                 if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
877                         if (t) {
878                                 if (t->dev != dev) {
879                                         err = -EEXIST;
880                                         break;
881                                 }
882                         } else {
883                                 unsigned int nflags = 0;
884
885                                 if (ipv4_is_multicast(p->iph.daddr))
886                                         nflags = IFF_BROADCAST;
887                                 else if (p->iph.daddr)
888                                         nflags = IFF_POINTOPOINT;
889
890                                 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
891                                         err = -EINVAL;
892                                         break;
893                                 }
894
895                                 t = netdev_priv(dev);
896                         }
897                 }
898
899                 if (t) {
900                         err = 0;
901                         ip_tunnel_update(itn, t, dev, p, true, 0);
902                 } else {
903                         err = -ENOENT;
904                 }
905                 break;
906
907         case SIOCDELTUNNEL:
908                 err = -EPERM;
909                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
910                         goto done;
911
912                 if (dev == itn->fb_tunnel_dev) {
913                         err = -ENOENT;
914                         t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
915                         if (!t)
916                                 goto done;
917                         err = -EPERM;
918                         if (t == netdev_priv(itn->fb_tunnel_dev))
919                                 goto done;
920                         dev = t->dev;
921                 }
922                 unregister_netdevice(dev);
923                 err = 0;
924                 break;
925
926         default:
927                 err = -EINVAL;
928         }
929
930 done:
931         return err;
932 }
933 EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
934
935 int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
936 {
937         struct ip_tunnel *tunnel = netdev_priv(dev);
938         int t_hlen = tunnel->hlen + sizeof(struct iphdr);
939         int max_mtu = IP_MAX_MTU - dev->hard_header_len - t_hlen;
940
941         if (new_mtu < ETH_MIN_MTU)
942                 return -EINVAL;
943
944         if (new_mtu > max_mtu) {
945                 if (strict)
946                         return -EINVAL;
947
948                 new_mtu = max_mtu;
949         }
950
951         dev->mtu = new_mtu;
952         return 0;
953 }
954 EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);
955
956 int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
957 {
958         return __ip_tunnel_change_mtu(dev, new_mtu, true);
959 }
960 EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
961
962 static void ip_tunnel_dev_free(struct net_device *dev)
963 {
964         struct ip_tunnel *tunnel = netdev_priv(dev);
965
966         gro_cells_destroy(&tunnel->gro_cells);
967         dst_cache_destroy(&tunnel->dst_cache);
968         free_percpu(dev->tstats);
969 }
970
971 void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
972 {
973         struct ip_tunnel *tunnel = netdev_priv(dev);
974         struct ip_tunnel_net *itn;
975
976         itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
977
978         if (itn->fb_tunnel_dev != dev) {
979                 ip_tunnel_del(itn, netdev_priv(dev));
980                 unregister_netdevice_queue(dev, head);
981         }
982 }
983 EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
984
985 struct net *ip_tunnel_get_link_net(const struct net_device *dev)
986 {
987         struct ip_tunnel *tunnel = netdev_priv(dev);
988
989         return tunnel->net;
990 }
991 EXPORT_SYMBOL(ip_tunnel_get_link_net);
992
993 int ip_tunnel_get_iflink(const struct net_device *dev)
994 {
995         struct ip_tunnel *tunnel = netdev_priv(dev);
996
997         return tunnel->parms.link;
998 }
999 EXPORT_SYMBOL(ip_tunnel_get_iflink);
1000
1001 int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
1002                                   struct rtnl_link_ops *ops, char *devname)
1003 {
1004         struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
1005         struct ip_tunnel_parm parms;
1006         unsigned int i;
1007
1008         itn->rtnl_link_ops = ops;
1009         for (i = 0; i < IP_TNL_HASH_SIZE; i++)
1010                 INIT_HLIST_HEAD(&itn->tunnels[i]);
1011
1012         if (!ops || !net_has_fallback_tunnels(net)) {
1013                 struct ip_tunnel_net *it_init_net;
1014
1015                 it_init_net = net_generic(&init_net, ip_tnl_net_id);
1016                 itn->type = it_init_net->type;
1017                 itn->fb_tunnel_dev = NULL;
1018                 return 0;
1019         }
1020
1021         memset(&parms, 0, sizeof(parms));
1022         if (devname)
1023                 strlcpy(parms.name, devname, IFNAMSIZ);
1024
1025         rtnl_lock();
1026         itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
1027         /* FB netdevice is special: we have one, and only one per netns.
1028          * Allowing to move it to another netns is clearly unsafe.
1029          */
1030         if (!IS_ERR(itn->fb_tunnel_dev)) {
1031                 itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
1032                 itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
1033                 ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
1034                 itn->type = itn->fb_tunnel_dev->type;
1035         }
1036         rtnl_unlock();
1037
1038         return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
1039 }
1040 EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
1041
1042 static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
1043                               struct list_head *head,
1044                               struct rtnl_link_ops *ops)
1045 {
1046         struct net_device *dev, *aux;
1047         int h;
1048
1049         for_each_netdev_safe(net, dev, aux)
1050                 if (dev->rtnl_link_ops == ops)
1051                         unregister_netdevice_queue(dev, head);
1052
1053         for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
1054                 struct ip_tunnel *t;
1055                 struct hlist_node *n;
1056                 struct hlist_head *thead = &itn->tunnels[h];
1057
1058                 hlist_for_each_entry_safe(t, n, thead, hash_node)
1059                         /* If dev is in the same netns, it has already
1060                          * been added to the list by the previous loop.
1061                          */
1062                         if (!net_eq(dev_net(t->dev), net))
1063                                 unregister_netdevice_queue(t->dev, head);
1064         }
1065 }
1066
1067 void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
1068                            struct rtnl_link_ops *ops)
1069 {
1070         struct ip_tunnel_net *itn;
1071         struct net *net;
1072         LIST_HEAD(list);
1073
1074         rtnl_lock();
1075         list_for_each_entry(net, net_list, exit_list) {
1076                 itn = net_generic(net, id);
1077                 ip_tunnel_destroy(net, itn, &list, ops);
1078         }
1079         unregister_netdevice_many(&list);
1080         rtnl_unlock();
1081 }
1082 EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);
1083
1084 int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
1085                       struct ip_tunnel_parm *p, __u32 fwmark)
1086 {
1087         struct ip_tunnel *nt;
1088         struct net *net = dev_net(dev);
1089         struct ip_tunnel_net *itn;
1090         int mtu;
1091         int err;
1092
1093         nt = netdev_priv(dev);
1094         itn = net_generic(net, nt->ip_tnl_net_id);
1095
1096         if (nt->collect_md) {
1097                 if (rtnl_dereference(itn->collect_md_tun))
1098                         return -EEXIST;
1099         } else {
1100                 if (ip_tunnel_find(itn, p, dev->type))
1101                         return -EEXIST;
1102         }
1103
1104         nt->net = net;
1105         nt->parms = *p;
1106         nt->fwmark = fwmark;
1107         err = register_netdevice(dev);
1108         if (err)
1109                 goto err_register_netdevice;
1110
1111         if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1112                 eth_hw_addr_random(dev);
1113
1114         mtu = ip_tunnel_bind_dev(dev);
1115         if (tb[IFLA_MTU]) {
1116                 unsigned int max = IP_MAX_MTU - dev->hard_header_len - nt->hlen;
1117
1118                 mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU,
1119                             (unsigned int)(max - sizeof(struct iphdr)));
1120         }
1121
1122         err = dev_set_mtu(dev, mtu);
1123         if (err)
1124                 goto err_dev_set_mtu;
1125
1126         ip_tunnel_add(itn, nt);
1127         return 0;
1128
1129 err_dev_set_mtu:
1130         unregister_netdevice(dev);
1131 err_register_netdevice:
1132         return err;
1133 }
1134 EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
1135
1136 int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
1137                          struct ip_tunnel_parm *p, __u32 fwmark)
1138 {
1139         struct ip_tunnel *t;
1140         struct ip_tunnel *tunnel = netdev_priv(dev);
1141         struct net *net = tunnel->net;
1142         struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
1143
1144         if (dev == itn->fb_tunnel_dev)
1145                 return -EINVAL;
1146
1147         t = ip_tunnel_find(itn, p, dev->type);
1148
1149         if (t) {
1150                 if (t->dev != dev)
1151                         return -EEXIST;
1152         } else {
1153                 t = tunnel;
1154
1155                 if (dev->type != ARPHRD_ETHER) {
1156                         unsigned int nflags = 0;
1157
1158                         if (ipv4_is_multicast(p->iph.daddr))
1159                                 nflags = IFF_BROADCAST;
1160                         else if (p->iph.daddr)
1161                                 nflags = IFF_POINTOPOINT;
1162
1163                         if ((dev->flags ^ nflags) &
1164                             (IFF_POINTOPOINT | IFF_BROADCAST))
1165                                 return -EINVAL;
1166                 }
1167         }
1168
1169         ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
1170         return 0;
1171 }
1172 EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
1173
1174 int ip_tunnel_init(struct net_device *dev)
1175 {
1176         struct ip_tunnel *tunnel = netdev_priv(dev);
1177         struct iphdr *iph = &tunnel->parms.iph;
1178         int err;
1179
1180         dev->needs_free_netdev = true;
1181         dev->priv_destructor = ip_tunnel_dev_free;
1182         dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
1183         if (!dev->tstats)
1184                 return -ENOMEM;
1185
1186         err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
1187         if (err) {
1188                 free_percpu(dev->tstats);
1189                 return err;
1190         }
1191
1192         err = gro_cells_init(&tunnel->gro_cells, dev);
1193         if (err) {
1194                 dst_cache_destroy(&tunnel->dst_cache);
1195                 free_percpu(dev->tstats);
1196                 return err;
1197         }
1198
1199         tunnel->dev = dev;
1200         tunnel->net = dev_net(dev);
1201         strcpy(tunnel->parms.name, dev->name);
1202         iph->version            = 4;
1203         iph->ihl                = 5;
1204
1205         if (tunnel->collect_md) {
1206                 dev->features |= NETIF_F_NETNS_LOCAL;
1207                 netif_keep_dst(dev);
1208         }
1209         return 0;
1210 }
1211 EXPORT_SYMBOL_GPL(ip_tunnel_init);
1212
1213 void ip_tunnel_uninit(struct net_device *dev)
1214 {
1215         struct ip_tunnel *tunnel = netdev_priv(dev);
1216         struct net *net = tunnel->net;
1217         struct ip_tunnel_net *itn;
1218
1219         itn = net_generic(net, tunnel->ip_tnl_net_id);
1220         /* fb_tunnel_dev will be unregisted in net-exit call. */
1221         if (itn->fb_tunnel_dev != dev)
1222                 ip_tunnel_del(itn, netdev_priv(dev));
1223
1224         dst_cache_reset(&tunnel->dst_cache);
1225 }
1226 EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1227
1228 /* Do least required initialization, rest of init is done in tunnel_init call */
1229 void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
1230 {
1231         struct ip_tunnel *tunnel = netdev_priv(dev);
1232         tunnel->ip_tnl_net_id = net_id;
1233 }
1234 EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1235
1236 MODULE_LICENSE("GPL");