OSDN Git Service

91ae061d46ac2afedb6a30783e5e37f37c65ece5
[sagit-ice-cold/kernel_xiaomi_msm8998.git] / net / ipv4 / ip_tunnel.c
1 /*
2  * Copyright (c) 2013 Nicira, Inc.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of version 2 of the GNU General Public
6  * License as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful, but
9  * WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public License
14  * along with this program; if not, write to the Free Software
15  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16  * 02110-1301, USA
17  */
18
19 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
20
21 #include <linux/capability.h>
22 #include <linux/module.h>
23 #include <linux/types.h>
24 #include <linux/kernel.h>
25 #include <linux/slab.h>
26 #include <linux/uaccess.h>
27 #include <linux/skbuff.h>
28 #include <linux/netdevice.h>
29 #include <linux/in.h>
30 #include <linux/tcp.h>
31 #include <linux/udp.h>
32 #include <linux/if_arp.h>
33 #include <linux/mroute.h>
34 #include <linux/init.h>
35 #include <linux/in6.h>
36 #include <linux/inetdevice.h>
37 #include <linux/igmp.h>
38 #include <linux/netfilter_ipv4.h>
39 #include <linux/etherdevice.h>
40 #include <linux/if_ether.h>
41 #include <linux/if_vlan.h>
42 #include <linux/rculist.h>
43 #include <linux/err.h>
44
45 #include <net/sock.h>
46 #include <net/ip.h>
47 #include <net/icmp.h>
48 #include <net/protocol.h>
49 #include <net/ip_tunnels.h>
50 #include <net/arp.h>
51 #include <net/checksum.h>
52 #include <net/dsfield.h>
53 #include <net/inet_ecn.h>
54 #include <net/xfrm.h>
55 #include <net/net_namespace.h>
56 #include <net/netns/generic.h>
57 #include <net/rtnetlink.h>
58 #include <net/udp.h>
59
60 #if IS_ENABLED(CONFIG_IPV6)
61 #include <net/ipv6.h>
62 #include <net/ip6_fib.h>
63 #include <net/ip6_route.h>
64 #endif
65
66 static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
67 {
68         return hash_32((__force u32)key ^ (__force u32)remote,
69                          IP_TNL_HASH_BITS);
70 }
71
72 static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
73                                 __be16 flags, __be32 key)
74 {
75         if (p->i_flags & TUNNEL_KEY) {
76                 if (flags & TUNNEL_KEY)
77                         return key == p->i_key;
78                 else
79                         /* key expected, none present */
80                         return false;
81         } else
82                 return !(flags & TUNNEL_KEY);
83 }
84
85 /* Fallback tunnel: no source, no destination, no key, no options
86
87    Tunnel hash table:
88    We require exact key match i.e. if a key is present in packet
89    it will match only tunnel with the same key; if it is not present,
90    it will match only keyless tunnel.
91
92    All keysless packets, if not matched configured keyless tunnels
93    will match fallback tunnel.
94    Given src, dst and key, find appropriate for input tunnel.
95 */
96 struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
97                                    int link, __be16 flags,
98                                    __be32 remote, __be32 local,
99                                    __be32 key)
100 {
101         unsigned int hash;
102         struct ip_tunnel *t, *cand = NULL;
103         struct hlist_head *head;
104
105         hash = ip_tunnel_hash(key, remote);
106         head = &itn->tunnels[hash];
107
108         hlist_for_each_entry_rcu(t, head, hash_node) {
109                 if (local != t->parms.iph.saddr ||
110                     remote != t->parms.iph.daddr ||
111                     !(t->dev->flags & IFF_UP))
112                         continue;
113
114                 if (!ip_tunnel_key_match(&t->parms, flags, key))
115                         continue;
116
117                 if (t->parms.link == link)
118                         return t;
119                 else
120                         cand = t;
121         }
122
123         hlist_for_each_entry_rcu(t, head, hash_node) {
124                 if (remote != t->parms.iph.daddr ||
125                     t->parms.iph.saddr != 0 ||
126                     !(t->dev->flags & IFF_UP))
127                         continue;
128
129                 if (!ip_tunnel_key_match(&t->parms, flags, key))
130                         continue;
131
132                 if (t->parms.link == link)
133                         return t;
134                 else if (!cand)
135                         cand = t;
136         }
137
138         hash = ip_tunnel_hash(key, 0);
139         head = &itn->tunnels[hash];
140
141         hlist_for_each_entry_rcu(t, head, hash_node) {
142                 if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
143                     (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
144                         continue;
145
146                 if (!(t->dev->flags & IFF_UP))
147                         continue;
148
149                 if (!ip_tunnel_key_match(&t->parms, flags, key))
150                         continue;
151
152                 if (t->parms.link == link)
153                         return t;
154                 else if (!cand)
155                         cand = t;
156         }
157
158         if (flags & TUNNEL_NO_KEY)
159                 goto skip_key_lookup;
160
161         hlist_for_each_entry_rcu(t, head, hash_node) {
162                 if (t->parms.i_key != key ||
163                     t->parms.iph.saddr != 0 ||
164                     t->parms.iph.daddr != 0 ||
165                     !(t->dev->flags & IFF_UP))
166                         continue;
167
168                 if (t->parms.link == link)
169                         return t;
170                 else if (!cand)
171                         cand = t;
172         }
173
174 skip_key_lookup:
175         if (cand)
176                 return cand;
177
178         t = rcu_dereference(itn->collect_md_tun);
179         if (t)
180                 return t;
181
182         if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
183                 return netdev_priv(itn->fb_tunnel_dev);
184
185         return NULL;
186 }
187 EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
188
189 static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
190                                     struct ip_tunnel_parm *parms)
191 {
192         unsigned int h;
193         __be32 remote;
194         __be32 i_key = parms->i_key;
195
196         if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
197                 remote = parms->iph.daddr;
198         else
199                 remote = 0;
200
201         if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
202                 i_key = 0;
203
204         h = ip_tunnel_hash(i_key, remote);
205         return &itn->tunnels[h];
206 }
207
208 static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
209 {
210         struct hlist_head *head = ip_bucket(itn, &t->parms);
211
212         if (t->collect_md)
213                 rcu_assign_pointer(itn->collect_md_tun, t);
214         hlist_add_head_rcu(&t->hash_node, head);
215 }
216
217 static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
218 {
219         if (t->collect_md)
220                 rcu_assign_pointer(itn->collect_md_tun, NULL);
221         hlist_del_init_rcu(&t->hash_node);
222 }
223
224 static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
225                                         struct ip_tunnel_parm *parms,
226                                         int type)
227 {
228         __be32 remote = parms->iph.daddr;
229         __be32 local = parms->iph.saddr;
230         __be32 key = parms->i_key;
231         __be16 flags = parms->i_flags;
232         int link = parms->link;
233         struct ip_tunnel *t = NULL;
234         struct hlist_head *head = ip_bucket(itn, parms);
235
236         hlist_for_each_entry_rcu(t, head, hash_node) {
237                 if (local == t->parms.iph.saddr &&
238                     remote == t->parms.iph.daddr &&
239                     link == t->parms.link &&
240                     type == t->dev->type &&
241                     ip_tunnel_key_match(&t->parms, flags, key))
242                         break;
243         }
244         return t;
245 }
246
247 static struct net_device *__ip_tunnel_create(struct net *net,
248                                              const struct rtnl_link_ops *ops,
249                                              struct ip_tunnel_parm *parms)
250 {
251         int err;
252         struct ip_tunnel *tunnel;
253         struct net_device *dev;
254         char name[IFNAMSIZ];
255
256         err = -E2BIG;
257         if (parms->name[0]) {
258                 if (!dev_valid_name(parms->name))
259                         goto failed;
260                 strlcpy(name, parms->name, IFNAMSIZ);
261         } else {
262                 if (strlen(ops->kind) > (IFNAMSIZ - 3))
263                         goto failed;
264                 strcpy(name, ops->kind);
265                 strcat(name, "%d");
266         }
267
268         ASSERT_RTNL();
269         dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
270         if (!dev) {
271                 err = -ENOMEM;
272                 goto failed;
273         }
274         dev_net_set(dev, net);
275
276         dev->rtnl_link_ops = ops;
277
278         tunnel = netdev_priv(dev);
279         tunnel->parms = *parms;
280         tunnel->net = net;
281
282         err = register_netdevice(dev);
283         if (err)
284                 goto failed_free;
285
286         return dev;
287
288 failed_free:
289         free_netdev(dev);
290 failed:
291         return ERR_PTR(err);
292 }
293
294 static inline void init_tunnel_flow(struct flowi4 *fl4,
295                                     int proto,
296                                     __be32 daddr, __be32 saddr,
297                                     __be32 key, __u8 tos, int oif)
298 {
299         memset(fl4, 0, sizeof(*fl4));
300         fl4->flowi4_oif = oif;
301         fl4->daddr = daddr;
302         fl4->saddr = saddr;
303         fl4->flowi4_tos = tos;
304         fl4->flowi4_proto = proto;
305         fl4->fl4_gre_key = key;
306 }
307
308 static int ip_tunnel_bind_dev(struct net_device *dev)
309 {
310         struct net_device *tdev = NULL;
311         struct ip_tunnel *tunnel = netdev_priv(dev);
312         const struct iphdr *iph;
313         int hlen = LL_MAX_HEADER;
314         int mtu = ETH_DATA_LEN;
315         int t_hlen = tunnel->hlen + sizeof(struct iphdr);
316
317         iph = &tunnel->parms.iph;
318
319         /* Guess output device to choose reasonable mtu and needed_headroom */
320         if (iph->daddr) {
321                 struct flowi4 fl4;
322                 struct rtable *rt;
323
324                 init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
325                                  iph->saddr, tunnel->parms.o_key,
326                                  RT_TOS(iph->tos), tunnel->parms.link);
327                 rt = ip_route_output_key(tunnel->net, &fl4);
328
329                 if (!IS_ERR(rt)) {
330                         tdev = rt->dst.dev;
331                         ip_rt_put(rt);
332                 }
333                 if (dev->type != ARPHRD_ETHER)
334                         dev->flags |= IFF_POINTOPOINT;
335
336                 dst_cache_reset(&tunnel->dst_cache);
337         }
338
339         if (!tdev && tunnel->parms.link)
340                 tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);
341
342         if (tdev) {
343                 hlen = tdev->hard_header_len + tdev->needed_headroom;
344                 mtu = tdev->mtu;
345         }
346
347         dev->needed_headroom = t_hlen + hlen;
348         mtu -= (dev->hard_header_len + t_hlen);
349
350         if (mtu < IPV4_MIN_MTU)
351                 mtu = IPV4_MIN_MTU;
352
353         return mtu;
354 }
355
356 static struct ip_tunnel *ip_tunnel_create(struct net *net,
357                                           struct ip_tunnel_net *itn,
358                                           struct ip_tunnel_parm *parms)
359 {
360         struct ip_tunnel *nt;
361         struct net_device *dev;
362
363         BUG_ON(!itn->fb_tunnel_dev);
364         dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
365         if (IS_ERR(dev))
366                 return ERR_CAST(dev);
367
368         dev->mtu = ip_tunnel_bind_dev(dev);
369
370         nt = netdev_priv(dev);
371         ip_tunnel_add(itn, nt);
372         return nt;
373 }
374
375 int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
376                   const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
377                   bool log_ecn_error)
378 {
379         struct pcpu_sw_netstats *tstats;
380         const struct iphdr *iph = ip_hdr(skb);
381         int err;
382
383 #ifdef CONFIG_NET_IPGRE_BROADCAST
384         if (ipv4_is_multicast(iph->daddr)) {
385                 tunnel->dev->stats.multicast++;
386                 skb->pkt_type = PACKET_BROADCAST;
387         }
388 #endif
389
390         if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
391              ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
392                 tunnel->dev->stats.rx_crc_errors++;
393                 tunnel->dev->stats.rx_errors++;
394                 goto drop;
395         }
396
397         if (tunnel->parms.i_flags&TUNNEL_SEQ) {
398                 if (!(tpi->flags&TUNNEL_SEQ) ||
399                     (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
400                         tunnel->dev->stats.rx_fifo_errors++;
401                         tunnel->dev->stats.rx_errors++;
402                         goto drop;
403                 }
404                 tunnel->i_seqno = ntohl(tpi->seq) + 1;
405         }
406
407         skb_reset_network_header(skb);
408
409         err = IP_ECN_decapsulate(iph, skb);
410         if (unlikely(err)) {
411                 if (log_ecn_error)
412                         net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
413                                         &iph->saddr, iph->tos);
414                 if (err > 1) {
415                         ++tunnel->dev->stats.rx_frame_errors;
416                         ++tunnel->dev->stats.rx_errors;
417                         goto drop;
418                 }
419         }
420
421         tstats = this_cpu_ptr(tunnel->dev->tstats);
422         u64_stats_update_begin(&tstats->syncp);
423         tstats->rx_packets++;
424         tstats->rx_bytes += skb->len;
425         u64_stats_update_end(&tstats->syncp);
426
427         skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
428
429         if (tunnel->dev->type == ARPHRD_ETHER) {
430                 skb->protocol = eth_type_trans(skb, tunnel->dev);
431                 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
432         } else {
433                 skb->dev = tunnel->dev;
434         }
435
436         if (tun_dst)
437                 skb_dst_set(skb, (struct dst_entry *)tun_dst);
438
439         gro_cells_receive(&tunnel->gro_cells, skb);
440         return 0;
441
442 drop:
443         kfree_skb(skb);
444         return 0;
445 }
446 EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
447
448 static int ip_encap_hlen(struct ip_tunnel_encap *e)
449 {
450         const struct ip_tunnel_encap_ops *ops;
451         int hlen = -EINVAL;
452
453         if (e->type == TUNNEL_ENCAP_NONE)
454                 return 0;
455
456         if (e->type >= MAX_IPTUN_ENCAP_OPS)
457                 return -EINVAL;
458
459         rcu_read_lock();
460         ops = rcu_dereference(iptun_encaps[e->type]);
461         if (likely(ops && ops->encap_hlen))
462                 hlen = ops->encap_hlen(e);
463         rcu_read_unlock();
464
465         return hlen;
466 }
467
468 const struct ip_tunnel_encap_ops __rcu *
469                 iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;
470
471 int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
472                             unsigned int num)
473 {
474         if (num >= MAX_IPTUN_ENCAP_OPS)
475                 return -ERANGE;
476
477         return !cmpxchg((const struct ip_tunnel_encap_ops **)
478                         &iptun_encaps[num],
479                         NULL, ops) ? 0 : -1;
480 }
481 EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
482
483 int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
484                             unsigned int num)
485 {
486         int ret;
487
488         if (num >= MAX_IPTUN_ENCAP_OPS)
489                 return -ERANGE;
490
491         ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
492                        &iptun_encaps[num],
493                        ops, NULL) == ops) ? 0 : -1;
494
495         synchronize_net();
496
497         return ret;
498 }
499 EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
500
501 int ip_tunnel_encap_setup(struct ip_tunnel *t,
502                           struct ip_tunnel_encap *ipencap)
503 {
504         int hlen;
505
506         memset(&t->encap, 0, sizeof(t->encap));
507
508         hlen = ip_encap_hlen(ipencap);
509         if (hlen < 0)
510                 return hlen;
511
512         t->encap.type = ipencap->type;
513         t->encap.sport = ipencap->sport;
514         t->encap.dport = ipencap->dport;
515         t->encap.flags = ipencap->flags;
516
517         t->encap_hlen = hlen;
518         t->hlen = t->encap_hlen + t->tun_hlen;
519
520         return 0;
521 }
522 EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
523
524 int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t,
525                     u8 *protocol, struct flowi4 *fl4)
526 {
527         const struct ip_tunnel_encap_ops *ops;
528         int ret = -EINVAL;
529
530         if (t->encap.type == TUNNEL_ENCAP_NONE)
531                 return 0;
532
533         if (t->encap.type >= MAX_IPTUN_ENCAP_OPS)
534                 return -EINVAL;
535
536         rcu_read_lock();
537         ops = rcu_dereference(iptun_encaps[t->encap.type]);
538         if (likely(ops && ops->build_header))
539                 ret = ops->build_header(skb, &t->encap, protocol, fl4);
540         rcu_read_unlock();
541
542         return ret;
543 }
544 EXPORT_SYMBOL(ip_tunnel_encap);
545
546 static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
547                             struct rtable *rt, __be16 df,
548                             const struct iphdr *inner_iph)
549 {
550         struct ip_tunnel *tunnel = netdev_priv(dev);
551         int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
552         int mtu;
553
554         if (df)
555                 mtu = dst_mtu(&rt->dst) - dev->hard_header_len
556                                         - sizeof(struct iphdr) - tunnel->hlen;
557         else
558                 mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
559
560         if (skb_dst(skb))
561                 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
562
563         if (skb->protocol == htons(ETH_P_IP)) {
564                 if (!skb_is_gso(skb) &&
565                     (inner_iph->frag_off & htons(IP_DF)) &&
566                     mtu < pkt_size) {
567                         memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
568                         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
569                         return -E2BIG;
570                 }
571         }
572 #if IS_ENABLED(CONFIG_IPV6)
573         else if (skb->protocol == htons(ETH_P_IPV6)) {
574                 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
575
576                 if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
577                            mtu >= IPV6_MIN_MTU) {
578                         if ((tunnel->parms.iph.daddr &&
579                             !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
580                             rt6->rt6i_dst.plen == 128) {
581                                 rt6->rt6i_flags |= RTF_MODIFIED;
582                                 dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
583                         }
584                 }
585
586                 if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
587                                         mtu < pkt_size) {
588                         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
589                         return -E2BIG;
590                 }
591         }
592 #endif
593         return 0;
594 }
595
596 void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
597                     const struct iphdr *tnl_params, u8 protocol)
598 {
599         struct ip_tunnel *tunnel = netdev_priv(dev);
600         unsigned int inner_nhdr_len = 0;
601         const struct iphdr *inner_iph;
602         struct flowi4 fl4;
603         u8     tos, ttl;
604         __be16 df;
605         struct rtable *rt;              /* Route to the other host */
606         unsigned int max_headroom;      /* The extra header space needed */
607         __be32 dst;
608         int err;
609         bool connected;
610
611         /* ensure we can access the inner net header, for several users below */
612         if (skb->protocol == htons(ETH_P_IP))
613                 inner_nhdr_len = sizeof(struct iphdr);
614         else if (skb->protocol == htons(ETH_P_IPV6))
615                 inner_nhdr_len = sizeof(struct ipv6hdr);
616         if (unlikely(!pskb_may_pull(skb, inner_nhdr_len)))
617                 goto tx_error;
618
619         inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
620         connected = (tunnel->parms.iph.daddr != 0);
621
622         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
623
624         dst = tnl_params->daddr;
625         if (dst == 0) {
626                 /* NBMA tunnel */
627
628                 if (!skb_dst(skb)) {
629                         dev->stats.tx_fifo_errors++;
630                         goto tx_error;
631                 }
632
633                 if (skb->protocol == htons(ETH_P_IP)) {
634                         rt = skb_rtable(skb);
635                         dst = rt_nexthop(rt, inner_iph->daddr);
636                 }
637 #if IS_ENABLED(CONFIG_IPV6)
638                 else if (skb->protocol == htons(ETH_P_IPV6)) {
639                         const struct in6_addr *addr6;
640                         struct neighbour *neigh;
641                         bool do_tx_error_icmp;
642                         int addr_type;
643
644                         neigh = dst_neigh_lookup(skb_dst(skb),
645                                                  &ipv6_hdr(skb)->daddr);
646                         if (!neigh)
647                                 goto tx_error;
648
649                         addr6 = (const struct in6_addr *)&neigh->primary_key;
650                         addr_type = ipv6_addr_type(addr6);
651
652                         if (addr_type == IPV6_ADDR_ANY) {
653                                 addr6 = &ipv6_hdr(skb)->daddr;
654                                 addr_type = ipv6_addr_type(addr6);
655                         }
656
657                         if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
658                                 do_tx_error_icmp = true;
659                         else {
660                                 do_tx_error_icmp = false;
661                                 dst = addr6->s6_addr32[3];
662                         }
663                         neigh_release(neigh);
664                         if (do_tx_error_icmp)
665                                 goto tx_error_icmp;
666                 }
667 #endif
668                 else
669                         goto tx_error;
670
671                 connected = false;
672         }
673
674         tos = tnl_params->tos;
675         if (tos & 0x1) {
676                 tos &= ~0x1;
677                 if (skb->protocol == htons(ETH_P_IP)) {
678                         tos = inner_iph->tos;
679                         connected = false;
680                 } else if (skb->protocol == htons(ETH_P_IPV6)) {
681                         tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
682                         connected = false;
683                 }
684         }
685
686         init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
687                          tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);
688
689         if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
690                 goto tx_error;
691
692         rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr) :
693                          NULL;
694
695         if (!rt) {
696                 rt = ip_route_output_key(tunnel->net, &fl4);
697
698                 if (IS_ERR(rt)) {
699                         dev->stats.tx_carrier_errors++;
700                         goto tx_error;
701                 }
702                 if (connected)
703                         dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
704                                           fl4.saddr);
705         }
706
707         if (rt->dst.dev == dev) {
708                 ip_rt_put(rt);
709                 dev->stats.collisions++;
710                 goto tx_error;
711         }
712
713         if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph)) {
714                 ip_rt_put(rt);
715                 goto tx_error;
716         }
717
718         if (tunnel->err_count > 0) {
719                 if (time_before(jiffies,
720                                 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
721                         tunnel->err_count--;
722
723                         dst_link_failure(skb);
724                 } else
725                         tunnel->err_count = 0;
726         }
727
728         tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
729         ttl = tnl_params->ttl;
730         if (ttl == 0) {
731                 if (skb->protocol == htons(ETH_P_IP))
732                         ttl = inner_iph->ttl;
733 #if IS_ENABLED(CONFIG_IPV6)
734                 else if (skb->protocol == htons(ETH_P_IPV6))
735                         ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
736 #endif
737                 else
738                         ttl = ip4_dst_hoplimit(&rt->dst);
739         }
740
741         df = tnl_params->frag_off;
742         if (skb->protocol == htons(ETH_P_IP))
743                 df |= (inner_iph->frag_off&htons(IP_DF));
744
745         max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
746                         + rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
747         if (max_headroom > dev->needed_headroom)
748                 dev->needed_headroom = max_headroom;
749
750         if (skb_cow_head(skb, dev->needed_headroom)) {
751                 ip_rt_put(rt);
752                 dev->stats.tx_dropped++;
753                 kfree_skb(skb);
754                 return;
755         }
756
757         err = iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol,
758                             tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)));
759         iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
760
761         return;
762
763 #if IS_ENABLED(CONFIG_IPV6)
764 tx_error_icmp:
765         dst_link_failure(skb);
766 #endif
767 tx_error:
768         dev->stats.tx_errors++;
769         kfree_skb(skb);
770 }
771 EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
772
773 static void ip_tunnel_update(struct ip_tunnel_net *itn,
774                              struct ip_tunnel *t,
775                              struct net_device *dev,
776                              struct ip_tunnel_parm *p,
777                              bool set_mtu)
778 {
779         ip_tunnel_del(itn, t);
780         t->parms.iph.saddr = p->iph.saddr;
781         t->parms.iph.daddr = p->iph.daddr;
782         t->parms.i_key = p->i_key;
783         t->parms.o_key = p->o_key;
784         if (dev->type != ARPHRD_ETHER) {
785                 memcpy(dev->dev_addr, &p->iph.saddr, 4);
786                 memcpy(dev->broadcast, &p->iph.daddr, 4);
787         }
788         ip_tunnel_add(itn, t);
789
790         t->parms.iph.ttl = p->iph.ttl;
791         t->parms.iph.tos = p->iph.tos;
792         t->parms.iph.frag_off = p->iph.frag_off;
793
794         if (t->parms.link != p->link) {
795                 int mtu;
796
797                 t->parms.link = p->link;
798                 mtu = ip_tunnel_bind_dev(dev);
799                 if (set_mtu)
800                         dev->mtu = mtu;
801         }
802         dst_cache_reset(&t->dst_cache);
803         netdev_state_change(dev);
804 }
805
806 int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
807 {
808         int err = 0;
809         struct ip_tunnel *t = netdev_priv(dev);
810         struct net *net = t->net;
811         struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);
812
813         BUG_ON(!itn->fb_tunnel_dev);
814         switch (cmd) {
815         case SIOCGETTUNNEL:
816                 if (dev == itn->fb_tunnel_dev) {
817                         t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
818                         if (!t)
819                                 t = netdev_priv(dev);
820                 }
821                 memcpy(p, &t->parms, sizeof(*p));
822                 break;
823
824         case SIOCADDTUNNEL:
825         case SIOCCHGTUNNEL:
826                 err = -EPERM;
827                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
828                         goto done;
829                 if (p->iph.ttl)
830                         p->iph.frag_off |= htons(IP_DF);
831                 if (!(p->i_flags & VTI_ISVTI)) {
832                         if (!(p->i_flags & TUNNEL_KEY))
833                                 p->i_key = 0;
834                         if (!(p->o_flags & TUNNEL_KEY))
835                                 p->o_key = 0;
836                 }
837
838                 t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
839
840                 if (cmd == SIOCADDTUNNEL) {
841                         if (!t) {
842                                 t = ip_tunnel_create(net, itn, p);
843                                 err = PTR_ERR_OR_ZERO(t);
844                                 break;
845                         }
846
847                         err = -EEXIST;
848                         break;
849                 }
850                 if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
851                         if (t) {
852                                 if (t->dev != dev) {
853                                         err = -EEXIST;
854                                         break;
855                                 }
856                         } else {
857                                 unsigned int nflags = 0;
858
859                                 if (ipv4_is_multicast(p->iph.daddr))
860                                         nflags = IFF_BROADCAST;
861                                 else if (p->iph.daddr)
862                                         nflags = IFF_POINTOPOINT;
863
864                                 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
865                                         err = -EINVAL;
866                                         break;
867                                 }
868
869                                 t = netdev_priv(dev);
870                         }
871                 }
872
873                 if (t) {
874                         err = 0;
875                         ip_tunnel_update(itn, t, dev, p, true);
876                 } else {
877                         err = -ENOENT;
878                 }
879                 break;
880
881         case SIOCDELTUNNEL:
882                 err = -EPERM;
883                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
884                         goto done;
885
886                 if (dev == itn->fb_tunnel_dev) {
887                         err = -ENOENT;
888                         t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
889                         if (!t)
890                                 goto done;
891                         err = -EPERM;
892                         if (t == netdev_priv(itn->fb_tunnel_dev))
893                                 goto done;
894                         dev = t->dev;
895                 }
896                 unregister_netdevice(dev);
897                 err = 0;
898                 break;
899
900         default:
901                 err = -EINVAL;
902         }
903
904 done:
905         return err;
906 }
907 EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
908
909 int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
910 {
911         struct ip_tunnel *tunnel = netdev_priv(dev);
912         int t_hlen = tunnel->hlen + sizeof(struct iphdr);
913         int max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen;
914
915         if (new_mtu < 68)
916                 return -EINVAL;
917
918         if (new_mtu > max_mtu) {
919                 if (strict)
920                         return -EINVAL;
921
922                 new_mtu = max_mtu;
923         }
924
925         dev->mtu = new_mtu;
926         return 0;
927 }
928 EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);
929
/*
 * Strict MTU change: forwards to __ip_tunnel_change_mtu() with strict set,
 * so an out-of-range @new_mtu yields -EINVAL instead of being lowered.
 */
int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	const bool strict = true;

	return __ip_tunnel_change_mtu(dev, new_mtu, strict);
}
934 EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
935
936 static void ip_tunnel_dev_free(struct net_device *dev)
937 {
938         struct ip_tunnel *tunnel = netdev_priv(dev);
939
940         gro_cells_destroy(&tunnel->gro_cells);
941         dst_cache_destroy(&tunnel->dst_cache);
942         free_percpu(dev->tstats);
943         free_netdev(dev);
944 }
945
946 void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
947 {
948         struct ip_tunnel *tunnel = netdev_priv(dev);
949         struct ip_tunnel_net *itn;
950
951         itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
952
953         if (itn->fb_tunnel_dev != dev) {
954                 ip_tunnel_del(itn, netdev_priv(dev));
955                 unregister_netdevice_queue(dev, head);
956         }
957 }
958 EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
959
960 struct net *ip_tunnel_get_link_net(const struct net_device *dev)
961 {
962         struct ip_tunnel *tunnel = netdev_priv(dev);
963
964         return tunnel->net;
965 }
966 EXPORT_SYMBOL(ip_tunnel_get_link_net);
967
968 int ip_tunnel_get_iflink(const struct net_device *dev)
969 {
970         struct ip_tunnel *tunnel = netdev_priv(dev);
971
972         return tunnel->parms.link;
973 }
974 EXPORT_SYMBOL(ip_tunnel_get_iflink);
975
976 int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
977                                   struct rtnl_link_ops *ops, char *devname)
978 {
979         struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
980         struct ip_tunnel_parm parms;
981         unsigned int i;
982
983         for (i = 0; i < IP_TNL_HASH_SIZE; i++)
984                 INIT_HLIST_HEAD(&itn->tunnels[i]);
985
986         if (!ops) {
987                 itn->fb_tunnel_dev = NULL;
988                 return 0;
989         }
990
991         memset(&parms, 0, sizeof(parms));
992         if (devname)
993                 strlcpy(parms.name, devname, IFNAMSIZ);
994
995         rtnl_lock();
996         itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
997         /* FB netdevice is special: we have one, and only one per netns.
998          * Allowing to move it to another netns is clearly unsafe.
999          */
1000         if (!IS_ERR(itn->fb_tunnel_dev)) {
1001                 itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
1002                 itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
1003                 ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
1004         }
1005         rtnl_unlock();
1006
1007         return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
1008 }
1009 EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
1010
1011 static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
1012                               struct rtnl_link_ops *ops)
1013 {
1014         struct net *net = dev_net(itn->fb_tunnel_dev);
1015         struct net_device *dev, *aux;
1016         int h;
1017
1018         for_each_netdev_safe(net, dev, aux)
1019                 if (dev->rtnl_link_ops == ops)
1020                         unregister_netdevice_queue(dev, head);
1021
1022         for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
1023                 struct ip_tunnel *t;
1024                 struct hlist_node *n;
1025                 struct hlist_head *thead = &itn->tunnels[h];
1026
1027                 hlist_for_each_entry_safe(t, n, thead, hash_node)
1028                         /* If dev is in the same netns, it has already
1029                          * been added to the list by the previous loop.
1030                          */
1031                         if (!net_eq(dev_net(t->dev), net))
1032                                 unregister_netdevice_queue(t->dev, head);
1033         }
1034 }
1035
1036 void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
1037 {
1038         LIST_HEAD(list);
1039
1040         rtnl_lock();
1041         ip_tunnel_destroy(itn, &list, ops);
1042         unregister_netdevice_many(&list);
1043         rtnl_unlock();
1044 }
1045 EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);
1046
1047 int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
1048                       struct ip_tunnel_parm *p)
1049 {
1050         struct ip_tunnel *nt;
1051         struct net *net = dev_net(dev);
1052         struct ip_tunnel_net *itn;
1053         int mtu;
1054         int err;
1055
1056         nt = netdev_priv(dev);
1057         itn = net_generic(net, nt->ip_tnl_net_id);
1058
1059         if (nt->collect_md) {
1060                 if (rtnl_dereference(itn->collect_md_tun))
1061                         return -EEXIST;
1062         } else {
1063                 if (ip_tunnel_find(itn, p, dev->type))
1064                         return -EEXIST;
1065         }
1066
1067         nt->net = net;
1068         nt->parms = *p;
1069         err = register_netdevice(dev);
1070         if (err)
1071                 goto out;
1072
1073         if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1074                 eth_hw_addr_random(dev);
1075
1076         mtu = ip_tunnel_bind_dev(dev);
1077         if (!tb[IFLA_MTU])
1078                 dev->mtu = mtu;
1079
1080         ip_tunnel_add(itn, nt);
1081 out:
1082         return err;
1083 }
1084 EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
1085
1086 int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
1087                          struct ip_tunnel_parm *p)
1088 {
1089         struct ip_tunnel *t;
1090         struct ip_tunnel *tunnel = netdev_priv(dev);
1091         struct net *net = tunnel->net;
1092         struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
1093
1094         if (dev == itn->fb_tunnel_dev)
1095                 return -EINVAL;
1096
1097         t = ip_tunnel_find(itn, p, dev->type);
1098
1099         if (t) {
1100                 if (t->dev != dev)
1101                         return -EEXIST;
1102         } else {
1103                 t = tunnel;
1104
1105                 if (dev->type != ARPHRD_ETHER) {
1106                         unsigned int nflags = 0;
1107
1108                         if (ipv4_is_multicast(p->iph.daddr))
1109                                 nflags = IFF_BROADCAST;
1110                         else if (p->iph.daddr)
1111                                 nflags = IFF_POINTOPOINT;
1112
1113                         if ((dev->flags ^ nflags) &
1114                             (IFF_POINTOPOINT | IFF_BROADCAST))
1115                                 return -EINVAL;
1116                 }
1117         }
1118
1119         ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
1120         return 0;
1121 }
1122 EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
1123
1124 int ip_tunnel_init(struct net_device *dev)
1125 {
1126         struct ip_tunnel *tunnel = netdev_priv(dev);
1127         struct iphdr *iph = &tunnel->parms.iph;
1128         int err;
1129
1130         dev->destructor = ip_tunnel_dev_free;
1131         dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
1132         if (!dev->tstats)
1133                 return -ENOMEM;
1134
1135         err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
1136         if (err) {
1137                 free_percpu(dev->tstats);
1138                 return err;
1139         }
1140
1141         err = gro_cells_init(&tunnel->gro_cells, dev);
1142         if (err) {
1143                 dst_cache_destroy(&tunnel->dst_cache);
1144                 free_percpu(dev->tstats);
1145                 return err;
1146         }
1147
1148         tunnel->dev = dev;
1149         tunnel->net = dev_net(dev);
1150         strcpy(tunnel->parms.name, dev->name);
1151         iph->version            = 4;
1152         iph->ihl                = 5;
1153
1154         if (tunnel->collect_md) {
1155                 dev->features |= NETIF_F_NETNS_LOCAL;
1156                 netif_keep_dst(dev);
1157         }
1158         return 0;
1159 }
1160 EXPORT_SYMBOL_GPL(ip_tunnel_init);
1161
1162 void ip_tunnel_uninit(struct net_device *dev)
1163 {
1164         struct ip_tunnel *tunnel = netdev_priv(dev);
1165         struct net *net = tunnel->net;
1166         struct ip_tunnel_net *itn;
1167
1168         itn = net_generic(net, tunnel->ip_tnl_net_id);
1169         /* fb_tunnel_dev will be unregisted in net-exit call. */
1170         if (itn->fb_tunnel_dev != dev)
1171                 ip_tunnel_del(itn, netdev_priv(dev));
1172
1173         dst_cache_reset(&tunnel->dst_cache);
1174 }
1175 EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1176
1177 /* Do least required initialization, rest of init is done in tunnel_init call */
1178 void ip_tunnel_setup(struct net_device *dev, int net_id)
1179 {
1180         struct ip_tunnel *tunnel = netdev_priv(dev);
1181         tunnel->ip_tnl_net_id = net_id;
1182 }
1183 EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1184
1185 MODULE_LICENSE("GPL");