OSDN Git Service

Merge branch 'master' of git://1984.lsi.us.es/nf-next
[uclinux-h8/linux.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <net/net_namespace.h>
48 #include <net/snmp.h>
49 #include <net/ipv6.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #include <net/ndisc.h>
53 #include <net/addrconf.h>
54 #include <net/tcp.h>
55 #include <linux/rtnetlink.h>
56 #include <net/dst.h>
57 #include <net/xfrm.h>
58 #include <net/netevent.h>
59 #include <net/netlink.h>
60
61 #include <asm/uaccess.h>
62
63 #ifdef CONFIG_SYSCTL
64 #include <linux/sysctl.h>
65 #endif
66
67 static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
68                                     const struct in6_addr *dest);
69 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
70 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
71 static unsigned int      ip6_mtu(const struct dst_entry *dst);
72 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
73 static void             ip6_dst_destroy(struct dst_entry *);
74 static void             ip6_dst_ifdown(struct dst_entry *,
75                                        struct net_device *dev, int how);
76 static int               ip6_dst_gc(struct dst_ops *ops);
77
78 static int              ip6_pkt_discard(struct sk_buff *skb);
79 static int              ip6_pkt_discard_out(struct sk_buff *skb);
80 static void             ip6_link_failure(struct sk_buff *skb);
81 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
82                                            struct sk_buff *skb, u32 mtu);
83 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
84                                         struct sk_buff *skb);
85
86 #ifdef CONFIG_IPV6_ROUTE_INFO
87 static struct rt6_info *rt6_add_route_info(struct net *net,
88                                            const struct in6_addr *prefix, int prefixlen,
89                                            const struct in6_addr *gwaddr, int ifindex,
90                                            unsigned int pref);
91 static struct rt6_info *rt6_get_route_info(struct net *net,
92                                            const struct in6_addr *prefix, int prefixlen,
93                                            const struct in6_addr *gwaddr, int ifindex);
94 #endif
95
96 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
97 {
98         struct rt6_info *rt = (struct rt6_info *) dst;
99         struct inet_peer *peer;
100         u32 *p = NULL;
101
102         if (!(rt->dst.flags & DST_HOST))
103                 return NULL;
104
105         peer = rt6_get_peer_create(rt);
106         if (peer) {
107                 u32 *old_p = __DST_METRICS_PTR(old);
108                 unsigned long prev, new;
109
110                 p = peer->metrics;
111                 if (inet_metrics_new(peer))
112                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
113
114                 new = (unsigned long) p;
115                 prev = cmpxchg(&dst->_metrics, old, new);
116
117                 if (prev != old) {
118                         p = __DST_METRICS_PTR(prev);
119                         if (prev & DST_METRICS_READ_ONLY)
120                                 p = NULL;
121                 }
122         }
123         return p;
124 }
125
126 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
127                                              struct sk_buff *skb,
128                                              const void *daddr)
129 {
130         struct in6_addr *p = &rt->rt6i_gateway;
131
132         if (!ipv6_addr_any(p))
133                 return (const void *) p;
134         else if (skb)
135                 return &ipv6_hdr(skb)->daddr;
136         return daddr;
137 }
138
139 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
140                                           struct sk_buff *skb,
141                                           const void *daddr)
142 {
143         struct rt6_info *rt = (struct rt6_info *) dst;
144         struct neighbour *n;
145
146         daddr = choose_neigh_daddr(rt, skb, daddr);
147         n = __ipv6_neigh_lookup(&nd_tbl, dst->dev, daddr);
148         if (n)
149                 return n;
150         return neigh_create(&nd_tbl, daddr, dst->dev);
151 }
152
153 static int rt6_bind_neighbour(struct rt6_info *rt, struct net_device *dev)
154 {
155         struct neighbour *n = __ipv6_neigh_lookup(&nd_tbl, dev, &rt->rt6i_gateway);
156         if (!n) {
157                 n = neigh_create(&nd_tbl, &rt->rt6i_gateway, dev);
158                 if (IS_ERR(n))
159                         return PTR_ERR(n);
160         }
161         rt->n = n;
162
163         return 0;
164 }
165
166 static struct dst_ops ip6_dst_ops_template = {
167         .family                 =       AF_INET6,
168         .protocol               =       cpu_to_be16(ETH_P_IPV6),
169         .gc                     =       ip6_dst_gc,
170         .gc_thresh              =       1024,
171         .check                  =       ip6_dst_check,
172         .default_advmss         =       ip6_default_advmss,
173         .mtu                    =       ip6_mtu,
174         .cow_metrics            =       ipv6_cow_metrics,
175         .destroy                =       ip6_dst_destroy,
176         .ifdown                 =       ip6_dst_ifdown,
177         .negative_advice        =       ip6_negative_advice,
178         .link_failure           =       ip6_link_failure,
179         .update_pmtu            =       ip6_rt_update_pmtu,
180         .redirect               =       rt6_do_redirect,
181         .local_out              =       __ip6_local_out,
182         .neigh_lookup           =       ip6_neigh_lookup,
183 };
184
185 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
186 {
187         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
188
189         return mtu ? : dst->dev->mtu;
190 }
191
192 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
193                                          struct sk_buff *skb, u32 mtu)
194 {
195 }
196
/* .redirect callback of ip6_dst_blackhole_ops: intentionally a no-op. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst,
                                      struct sock *sk,
                                      struct sk_buff *skb)
{
        /* nothing to do */
}
201
202 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
203                                          unsigned long old)
204 {
205         return NULL;
206 }
207
208 static struct dst_ops ip6_dst_blackhole_ops = {
209         .family                 =       AF_INET6,
210         .protocol               =       cpu_to_be16(ETH_P_IPV6),
211         .destroy                =       ip6_dst_destroy,
212         .check                  =       ip6_dst_check,
213         .mtu                    =       ip6_blackhole_mtu,
214         .default_advmss         =       ip6_default_advmss,
215         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
216         .redirect               =       ip6_rt_blackhole_redirect,
217         .cow_metrics            =       ip6_rt_blackhole_cow_metrics,
218         .neigh_lookup           =       ip6_neigh_lookup,
219 };
220
221 static const u32 ip6_template_metrics[RTAX_MAX] = {
222         [RTAX_HOPLIMIT - 1] = 255,
223 };
224
225 static struct rt6_info ip6_null_entry_template = {
226         .dst = {
227                 .__refcnt       = ATOMIC_INIT(1),
228                 .__use          = 1,
229                 .obsolete       = -1,
230                 .error          = -ENETUNREACH,
231                 .input          = ip6_pkt_discard,
232                 .output         = ip6_pkt_discard_out,
233         },
234         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
235         .rt6i_protocol  = RTPROT_KERNEL,
236         .rt6i_metric    = ~(u32) 0,
237         .rt6i_ref       = ATOMIC_INIT(1),
238 };
239
240 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
241
/* input/output handlers of ip6_prohibit_entry_template */
static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct sk_buff *skb);
244
245 static struct rt6_info ip6_prohibit_entry_template = {
246         .dst = {
247                 .__refcnt       = ATOMIC_INIT(1),
248                 .__use          = 1,
249                 .obsolete       = -1,
250                 .error          = -EACCES,
251                 .input          = ip6_pkt_prohibit,
252                 .output         = ip6_pkt_prohibit_out,
253         },
254         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
255         .rt6i_protocol  = RTPROT_KERNEL,
256         .rt6i_metric    = ~(u32) 0,
257         .rt6i_ref       = ATOMIC_INIT(1),
258 };
259
260 static struct rt6_info ip6_blk_hole_entry_template = {
261         .dst = {
262                 .__refcnt       = ATOMIC_INIT(1),
263                 .__use          = 1,
264                 .obsolete       = -1,
265                 .error          = -EINVAL,
266                 .input          = dst_discard,
267                 .output         = dst_discard,
268         },
269         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
270         .rt6i_protocol  = RTPROT_KERNEL,
271         .rt6i_metric    = ~(u32) 0,
272         .rt6i_ref       = ATOMIC_INIT(1),
273 };
274
275 #endif
276
277 /* allocate dst with ip6_dst_ops */
278 static inline struct rt6_info *ip6_dst_alloc(struct net *net,
279                                              struct net_device *dev,
280                                              int flags,
281                                              struct fib6_table *table)
282 {
283         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
284                                         0, DST_OBSOLETE_NONE, flags);
285
286         if (rt) {
287                 struct dst_entry *dst = &rt->dst;
288
289                 memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
290                 rt6_init_peer(rt, table ? &table->tb6_peers : net->ipv6.peers);
291         }
292         return rt;
293 }
294
295 static void ip6_dst_destroy(struct dst_entry *dst)
296 {
297         struct rt6_info *rt = (struct rt6_info *)dst;
298         struct inet6_dev *idev = rt->rt6i_idev;
299
300         if (rt->n)
301                 neigh_release(rt->n);
302
303         if (!(rt->dst.flags & DST_HOST))
304                 dst_destroy_metrics_generic(dst);
305
306         if (idev) {
307                 rt->rt6i_idev = NULL;
308                 in6_dev_put(idev);
309         }
310
311         if (!(rt->rt6i_flags & RTF_EXPIRES) && dst->from)
312                 dst_release(dst->from);
313
314         if (rt6_has_peer(rt)) {
315                 struct inet_peer *peer = rt6_peer_ptr(rt);
316                 inet_putpeer(peer);
317         }
318 }
319
320 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
321
322 static u32 rt6_peer_genid(void)
323 {
324         return atomic_read(&__rt6_peer_genid);
325 }
326
327 void rt6_bind_peer(struct rt6_info *rt, int create)
328 {
329         struct inet_peer_base *base;
330         struct inet_peer *peer;
331
332         base = inetpeer_base_ptr(rt->_rt6i_peer);
333         if (!base)
334                 return;
335
336         peer = inet_getpeer_v6(base, &rt->rt6i_dst.addr, create);
337         if (peer) {
338                 if (!rt6_set_peer(rt, peer))
339                         inet_putpeer(peer);
340                 else
341                         rt->rt6i_peer_genid = rt6_peer_genid();
342         }
343 }
344
345 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
346                            int how)
347 {
348         struct rt6_info *rt = (struct rt6_info *)dst;
349         struct inet6_dev *idev = rt->rt6i_idev;
350         struct net_device *loopback_dev =
351                 dev_net(dev)->loopback_dev;
352
353         if (dev != loopback_dev) {
354                 if (idev && idev->dev == dev) {
355                         struct inet6_dev *loopback_idev =
356                                 in6_dev_get(loopback_dev);
357                         if (loopback_idev) {
358                                 rt->rt6i_idev = loopback_idev;
359                                 in6_dev_put(idev);
360                         }
361                 }
362                 if (rt->n && rt->n->dev == dev) {
363                         rt->n->dev = loopback_dev;
364                         dev_hold(loopback_dev);
365                         dev_put(dev);
366                 }
367         }
368 }
369
370 static bool rt6_check_expired(const struct rt6_info *rt)
371 {
372         struct rt6_info *ort = NULL;
373
374         if (rt->rt6i_flags & RTF_EXPIRES) {
375                 if (time_after(jiffies, rt->dst.expires))
376                         return true;
377         } else if (rt->dst.from) {
378                 ort = (struct rt6_info *) rt->dst.from;
379                 return (ort->rt6i_flags & RTF_EXPIRES) &&
380                         time_after(jiffies, ort->dst.expires);
381         }
382         return false;
383 }
384
385 static bool rt6_need_strict(const struct in6_addr *daddr)
386 {
387         return ipv6_addr_type(daddr) &
388                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
389 }
390
391 /*
392  *      Route lookup. Any table->tb6_lock is implied.
393  */
394
395 static inline struct rt6_info *rt6_device_match(struct net *net,
396                                                     struct rt6_info *rt,
397                                                     const struct in6_addr *saddr,
398                                                     int oif,
399                                                     int flags)
400 {
401         struct rt6_info *local = NULL;
402         struct rt6_info *sprt;
403
404         if (!oif && ipv6_addr_any(saddr))
405                 goto out;
406
407         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
408                 struct net_device *dev = sprt->dst.dev;
409
410                 if (oif) {
411                         if (dev->ifindex == oif)
412                                 return sprt;
413                         if (dev->flags & IFF_LOOPBACK) {
414                                 if (!sprt->rt6i_idev ||
415                                     sprt->rt6i_idev->dev->ifindex != oif) {
416                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
417                                                 continue;
418                                         if (local && (!oif ||
419                                                       local->rt6i_idev->dev->ifindex == oif))
420                                                 continue;
421                                 }
422                                 local = sprt;
423                         }
424                 } else {
425                         if (ipv6_chk_addr(net, saddr, dev,
426                                           flags & RT6_LOOKUP_F_IFACE))
427                                 return sprt;
428                 }
429         }
430
431         if (oif) {
432                 if (local)
433                         return local;
434
435                 if (flags & RT6_LOOKUP_F_IFACE)
436                         return net->ipv6.ip6_null_entry;
437         }
438 out:
439         return rt;
440 }
441
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing.
 *
 * If the neighbour bound to @rt is not in a NUD_VALID state and the
 * last update of the neighbour is older than the interface's
 * rtr_probe_interval, send a neighbour solicitation for it to the
 * solicited-node multicast address.  A NULL @rt, or a route without a
 * bound neighbour, is ignored.
 *
 * Router Reachability Probe MUST be rate-limited to no more than one
 * per minute.  The rate limit works through neigh->updated, so the
 * check and the update of that field are done under the write side of
 * neigh->lock: with only the read side held, several CPUs could pass
 * the check at the same time and each send a probe, and the store to
 * neigh->updated would race with other holders of the lock.
 */
static void rt6_probe(struct rt6_info *rt)
{
        struct neighbour *neigh;
        struct in6_addr mcaddr;
        struct in6_addr *target;

        neigh = rt ? rt->n : NULL;
        if (!neigh || (neigh->nud_state & NUD_VALID))
                return;

        write_lock_bh(&neigh->lock);
        if ((neigh->nud_state & NUD_VALID) ||
            !time_after(jiffies,
                        neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
                /* became valid meanwhile, or probed too recently */
                write_unlock_bh(&neigh->lock);
                return;
        }
        neigh->updated = jiffies;
        write_unlock_bh(&neigh->lock);

        /* the solicitation itself is sent without holding the lock */
        target = (struct in6_addr *)&neigh->primary_key;
        addrconf_addr_solict_mult(target, &mcaddr);
        ndisc_send_ns(rt->dst.dev, NULL, target, &mcaddr, NULL);
}
#else
/* Without CONFIG_IPV6_ROUTER_PREF no probing is done. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
478
479 /*
480  * Default Router Selection (RFC 2461 6.3.6)
481  */
482 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
483 {
484         struct net_device *dev = rt->dst.dev;
485         if (!oif || dev->ifindex == oif)
486                 return 2;
487         if ((dev->flags & IFF_LOOPBACK) &&
488             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
489                 return 1;
490         return 0;
491 }
492
493 static inline int rt6_check_neigh(struct rt6_info *rt)
494 {
495         struct neighbour *neigh;
496         int m;
497
498         neigh = rt->n;
499         if (rt->rt6i_flags & RTF_NONEXTHOP ||
500             !(rt->rt6i_flags & RTF_GATEWAY))
501                 m = 1;
502         else if (neigh) {
503                 read_lock_bh(&neigh->lock);
504                 if (neigh->nud_state & NUD_VALID)
505                         m = 2;
506 #ifdef CONFIG_IPV6_ROUTER_PREF
507                 else if (neigh->nud_state & NUD_FAILED)
508                         m = 0;
509 #endif
510                 else
511                         m = 1;
512                 read_unlock_bh(&neigh->lock);
513         } else
514                 m = 0;
515         return m;
516 }
517
518 static int rt6_score_route(struct rt6_info *rt, int oif,
519                            int strict)
520 {
521         int m, n;
522
523         m = rt6_check_dev(rt, oif);
524         if (!m && (strict & RT6_LOOKUP_F_IFACE))
525                 return -1;
526 #ifdef CONFIG_IPV6_ROUTER_PREF
527         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
528 #endif
529         n = rt6_check_neigh(rt);
530         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
531                 return -1;
532         return m;
533 }
534
535 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
536                                    int *mpri, struct rt6_info *match)
537 {
538         int m;
539
540         if (rt6_check_expired(rt))
541                 goto out;
542
543         m = rt6_score_route(rt, oif, strict);
544         if (m < 0)
545                 goto out;
546
547         if (m > *mpri) {
548                 if (strict & RT6_LOOKUP_F_REACHABLE)
549                         rt6_probe(match);
550                 *mpri = m;
551                 match = rt;
552         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
553                 rt6_probe(rt);
554         }
555
556 out:
557         return match;
558 }
559
560 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
561                                      struct rt6_info *rr_head,
562                                      u32 metric, int oif, int strict)
563 {
564         struct rt6_info *rt, *match;
565         int mpri = -1;
566
567         match = NULL;
568         for (rt = rr_head; rt && rt->rt6i_metric == metric;
569              rt = rt->dst.rt6_next)
570                 match = find_match(rt, oif, strict, &mpri, match);
571         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
572              rt = rt->dst.rt6_next)
573                 match = find_match(rt, oif, strict, &mpri, match);
574
575         return match;
576 }
577
578 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
579 {
580         struct rt6_info *match, *rt0;
581         struct net *net;
582
583         rt0 = fn->rr_ptr;
584         if (!rt0)
585                 fn->rr_ptr = rt0 = fn->leaf;
586
587         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
588
589         if (!match &&
590             (strict & RT6_LOOKUP_F_REACHABLE)) {
591                 struct rt6_info *next = rt0->dst.rt6_next;
592
593                 /* no entries matched; do round-robin */
594                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
595                         next = fn->leaf;
596
597                 if (next != rt0)
598                         fn->rr_ptr = next;
599         }
600
601         net = dev_net(rt0->dst.dev);
602         return match ? match : net->ipv6.ip6_null_entry;
603 }
604
605 #ifdef CONFIG_IPV6_ROUTE_INFO
606 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
607                   const struct in6_addr *gwaddr)
608 {
609         struct net *net = dev_net(dev);
610         struct route_info *rinfo = (struct route_info *) opt;
611         struct in6_addr prefix_buf, *prefix;
612         unsigned int pref;
613         unsigned long lifetime;
614         struct rt6_info *rt;
615
616         if (len < sizeof(struct route_info)) {
617                 return -EINVAL;
618         }
619
620         /* Sanity check for prefix_len and length */
621         if (rinfo->length > 3) {
622                 return -EINVAL;
623         } else if (rinfo->prefix_len > 128) {
624                 return -EINVAL;
625         } else if (rinfo->prefix_len > 64) {
626                 if (rinfo->length < 2) {
627                         return -EINVAL;
628                 }
629         } else if (rinfo->prefix_len > 0) {
630                 if (rinfo->length < 1) {
631                         return -EINVAL;
632                 }
633         }
634
635         pref = rinfo->route_pref;
636         if (pref == ICMPV6_ROUTER_PREF_INVALID)
637                 return -EINVAL;
638
639         lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
640
641         if (rinfo->length == 3)
642                 prefix = (struct in6_addr *)rinfo->prefix;
643         else {
644                 /* this function is safe */
645                 ipv6_addr_prefix(&prefix_buf,
646                                  (struct in6_addr *)rinfo->prefix,
647                                  rinfo->prefix_len);
648                 prefix = &prefix_buf;
649         }
650
651         rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
652                                 dev->ifindex);
653
654         if (rt && !lifetime) {
655                 ip6_del_rt(rt);
656                 rt = NULL;
657         }
658
659         if (!rt && lifetime)
660                 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
661                                         pref);
662         else if (rt)
663                 rt->rt6i_flags = RTF_ROUTEINFO |
664                                  (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
665
666         if (rt) {
667                 if (!addrconf_finite_timeout(lifetime))
668                         rt6_clean_expires(rt);
669                 else
670                         rt6_set_expires(rt, jiffies + HZ * lifetime);
671
672                 dst_release(&rt->dst);
673         }
674         return 0;
675 }
676 #endif
677
/*
 * BACKTRACK - climb the fib6 tree after a failed selection.
 *
 * Does nothing unless the caller's `rt` is the null entry of @__net.
 * Otherwise walks from the caller's `fn` towards the root: at each
 * step it moves to the parent, or, if the parent has a subtree other
 * than `fn`, to the result of looking @saddr up in that subtree.
 *
 * This macro relies on the expansion site providing the locals `rt`
 * and `fn` and the labels `out` and `restart`: it jumps to `out` when
 * the top-level root is reached and to `restart` when a node with
 * RTN_RTINFO is found.
 */
#define BACKTRACK(__net, saddr)                                         \
do {                                                                    \
        struct fib6_node *pn;                                           \
                                                                        \
        if (rt != __net->ipv6.ip6_null_entry)                           \
                break;                                                  \
                                                                        \
        for (;;) {                                                      \
                if (fn->fn_flags & RTN_TL_ROOT)                         \
                        goto out;                                       \
                pn = fn->parent;                                        \
                if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)         \
                        fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
                else                                                    \
                        fn = pn;                                        \
                if (fn->fn_flags & RTN_RTINFO)                          \
                        goto restart;                                   \
        }                                                               \
} while (0)
695
696 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
697                                              struct fib6_table *table,
698                                              struct flowi6 *fl6, int flags)
699 {
700         struct fib6_node *fn;
701         struct rt6_info *rt;
702
703         read_lock_bh(&table->tb6_lock);
704         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
705 restart:
706         rt = fn->leaf;
707         rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
708         BACKTRACK(net, &fl6->saddr);
709 out:
710         dst_use(&rt->dst, jiffies);
711         read_unlock_bh(&table->tb6_lock);
712         return rt;
713
714 }
715
716 struct dst_entry * ip6_route_lookup(struct net *net, struct flowi6 *fl6,
717                                     int flags)
718 {
719         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
720 }
721 EXPORT_SYMBOL_GPL(ip6_route_lookup);
722
723 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
724                             const struct in6_addr *saddr, int oif, int strict)
725 {
726         struct flowi6 fl6 = {
727                 .flowi6_oif = oif,
728                 .daddr = *daddr,
729         };
730         struct dst_entry *dst;
731         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
732
733         if (saddr) {
734                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
735                 flags |= RT6_LOOKUP_F_HAS_SADDR;
736         }
737
738         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
739         if (dst->error == 0)
740                 return (struct rt6_info *) dst;
741
742         dst_release(dst);
743
744         return NULL;
745 }
746
747 EXPORT_SYMBOL(rt6_lookup);
748
749 /* ip6_ins_rt is called with FREE table->tb6_lock.
750    It takes new route entry, the addition fails by any reason the
751    route is freed. In any case, if caller does not hold it, it may
752    be destroyed.
753  */
754
755 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
756 {
757         int err;
758         struct fib6_table *table;
759
760         table = rt->rt6i_table;
761         write_lock_bh(&table->tb6_lock);
762         err = fib6_add(&table->tb6_root, rt, info);
763         write_unlock_bh(&table->tb6_lock);
764
765         return err;
766 }
767
768 int ip6_ins_rt(struct rt6_info *rt)
769 {
770         struct nl_info info = {
771                 .nl_net = dev_net(rt->dst.dev),
772         };
773         return __ip6_ins_rt(rt, &info);
774 }
775
776 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort,
777                                       const struct in6_addr *daddr,
778                                       const struct in6_addr *saddr)
779 {
780         struct rt6_info *rt;
781
782         /*
783          *      Clone the route.
784          */
785
786         rt = ip6_rt_copy(ort, daddr);
787
788         if (rt) {
789                 int attempts = !in_softirq();
790
791                 if (!(rt->rt6i_flags & RTF_GATEWAY)) {
792                         if (ort->rt6i_dst.plen != 128 &&
793                             ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
794                                 rt->rt6i_flags |= RTF_ANYCAST;
795                         rt->rt6i_gateway = *daddr;
796                 }
797
798                 rt->rt6i_flags |= RTF_CACHE;
799
800 #ifdef CONFIG_IPV6_SUBTREES
801                 if (rt->rt6i_src.plen && saddr) {
802                         rt->rt6i_src.addr = *saddr;
803                         rt->rt6i_src.plen = 128;
804                 }
805 #endif
806
807         retry:
808                 if (rt6_bind_neighbour(rt, rt->dst.dev)) {
809                         struct net *net = dev_net(rt->dst.dev);
810                         int saved_rt_min_interval =
811                                 net->ipv6.sysctl.ip6_rt_gc_min_interval;
812                         int saved_rt_elasticity =
813                                 net->ipv6.sysctl.ip6_rt_gc_elasticity;
814
815                         if (attempts-- > 0) {
816                                 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
817                                 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
818
819                                 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
820
821                                 net->ipv6.sysctl.ip6_rt_gc_elasticity =
822                                         saved_rt_elasticity;
823                                 net->ipv6.sysctl.ip6_rt_gc_min_interval =
824                                         saved_rt_min_interval;
825                                 goto retry;
826                         }
827
828                         net_warn_ratelimited("Neighbour table overflow\n");
829                         dst_free(&rt->dst);
830                         return NULL;
831                 }
832         }
833
834         return rt;
835 }
836
837 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
838                                         const struct in6_addr *daddr)
839 {
840         struct rt6_info *rt = ip6_rt_copy(ort, daddr);
841
842         if (rt) {
843                 rt->rt6i_flags |= RTF_CACHE;
844                 rt->n = neigh_clone(ort->n);
845         }
846         return rt;
847 }
848
/*
 * Core policy-routing lookup shared by ip6_pol_route_input() and
 * ip6_pol_route_output().
 *
 * Looks up @fl6 in @table under the table read lock and picks a route with
 * rt6_select(), passing it @oif and the strictness bits derived from @flags.
 * If the selected route is the namespace's null entry or already carries
 * RTF_CACHE it is returned as is.  Otherwise the lock is dropped, a
 * per-destination copy is built with rt6_alloc_cow() or rt6_alloc_clone()
 * and inserted with ip6_ins_rt().  A failed insertion restarts the whole
 * lookup; the "attempts" counter bounds this to three passes.
 *
 * The route that is returned has had dst_hold() called on it and its
 * lastuse/__use fields updated.
 */
849 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
850                                       struct flowi6 *fl6, int flags)
851 {
852         struct fib6_node *fn;
853         struct rt6_info *rt, *nrt;
854         int strict = 0;
855         int attempts = 3;
856         int err;
            /* Only ask for a reachable router on the first pass when forwarding is off. */
857         int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
858
859         strict |= flags & RT6_LOOKUP_F_IFACE;
860
861 relookup:
862         read_lock_bh(&table->tb6_lock);
863
864 restart_2:
865         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
866
867 restart:
868         rt = rt6_select(fn, oif, strict | reachable);
869
            /* NOTE(review): BACKTRACK is a macro defined outside this chunk; it
             * presumably walks up the tree and jumps back to "restart"/"out" —
             * confirm against its definition. */
870         BACKTRACK(net, &fl6->saddr);
871         if (rt == net->ipv6.ip6_null_entry ||
872             rt->rt6i_flags & RTF_CACHE)
873                 goto out;
874
            /* Pin the route before dropping the lock so it can be copied safely. */
875         dst_hold(&rt->dst);
876         read_unlock_bh(&table->tb6_lock);
877
            /*
             * No neighbour bound and a next hop is expected: copy-on-write.
             * Otherwise a non-host route is cloned; a host route is returned
             * directly (still holding the reference taken above).
             */
878         if (!rt->n && !(rt->rt6i_flags & RTF_NONEXTHOP))
879                 nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
880         else if (!(rt->dst.flags & DST_HOST))
881                 nrt = rt6_alloc_clone(rt, &fl6->daddr);
882         else
883                 goto out2;
884
            /* Swap the reference from the original route to the copy, or to
             * the null entry when the copy could not be allocated. */
885         dst_release(&rt->dst);
886         rt = nrt ? : net->ipv6.ip6_null_entry;
887
888         dst_hold(&rt->dst);
889         if (nrt) {
890                 err = ip6_ins_rt(nrt);
891                 if (!err)
892                         goto out2;
893         }
894
            /* Out of retries: return whatever rt currently points at. */
895         if (--attempts <= 0)
896                 goto out2;
897
898         /*
899          * Race condition! In the gap, when table->tb6_lock was
900          * released someone could insert this route.  Relookup.
901          */
902         dst_release(&rt->dst);
903         goto relookup;
904
905 out:
            /* First pass asked for a reachable router: redo the lookup once
             * without that requirement before settling on this result. */
906         if (reachable) {
907                 reachable = 0;
908                 goto restart_2;
909         }
910         dst_hold(&rt->dst);
911         read_unlock_bh(&table->tb6_lock);
912 out2:
913         rt->dst.lastuse = jiffies;
914         rt->dst.__use++;
915
916         return rt;
917 }
918
919 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
920                                             struct flowi6 *fl6, int flags)
921 {
922         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
923 }
924
925 static struct dst_entry *ip6_route_input_lookup(struct net *net,
926                                                 struct net_device *dev,
927                                                 struct flowi6 *fl6, int flags)
928 {
929         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
930                 flags |= RT6_LOOKUP_F_IFACE;
931
932         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
933 }
934
935 void ip6_route_input(struct sk_buff *skb)
936 {
937         const struct ipv6hdr *iph = ipv6_hdr(skb);
938         struct net *net = dev_net(skb->dev);
939         int flags = RT6_LOOKUP_F_HAS_SADDR;
940         struct flowi6 fl6 = {
941                 .flowi6_iif = skb->dev->ifindex,
942                 .daddr = iph->daddr,
943                 .saddr = iph->saddr,
944                 .flowlabel = (* (__be32 *) iph) & IPV6_FLOWINFO_MASK,
945                 .flowi6_mark = skb->mark,
946                 .flowi6_proto = iph->nexthdr,
947         };
948
949         skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
950 }
951
952 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
953                                              struct flowi6 *fl6, int flags)
954 {
955         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
956 }
957
958 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
959                                     struct flowi6 *fl6)
960 {
961         int flags = 0;
962
963         fl6->flowi6_iif = LOOPBACK_IFINDEX;
964
965         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
966                 flags |= RT6_LOOKUP_F_IFACE;
967
968         if (!ipv6_addr_any(&fl6->saddr))
969                 flags |= RT6_LOOKUP_F_HAS_SADDR;
970         else if (sk)
971                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
972
973         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
974 }
975
976 EXPORT_SYMBOL(ip6_route_output);
977
978 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
979 {
980         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
981         struct dst_entry *new = NULL;
982
983         rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0);
984         if (rt) {
985                 new = &rt->dst;
986
987                 memset(new + 1, 0, sizeof(*rt) - sizeof(*new));
988                 rt6_init_peer(rt, net->ipv6.peers);
989
990                 new->__use = 1;
991                 new->input = dst_discard;
992                 new->output = dst_discard;
993
994                 if (dst_metrics_read_only(&ort->dst))
995                         new->_metrics = ort->dst._metrics;
996                 else
997                         dst_copy_metrics(new, &ort->dst);
998                 rt->rt6i_idev = ort->rt6i_idev;
999                 if (rt->rt6i_idev)
1000                         in6_dev_hold(rt->rt6i_idev);
1001
1002                 rt->rt6i_gateway = ort->rt6i_gateway;
1003                 rt->rt6i_flags = ort->rt6i_flags;
1004                 rt6_clean_expires(rt);
1005                 rt->rt6i_metric = 0;
1006
1007                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1008 #ifdef CONFIG_IPV6_SUBTREES
1009                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1010 #endif
1011
1012                 dst_free(new);
1013         }
1014
1015         dst_release(dst_orig);
1016         return new ? new : ERR_PTR(-ENOMEM);
1017 }
1018
1019 /*
1020  *      Destination cache support functions
1021  */
1022
1023 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1024 {
1025         struct rt6_info *rt;
1026
1027         rt = (struct rt6_info *) dst;
1028
1029         if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
1030                 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
1031                         if (!rt6_has_peer(rt))
1032                                 rt6_bind_peer(rt, 0);
1033                         rt->rt6i_peer_genid = rt6_peer_genid();
1034                 }
1035                 return dst;
1036         }
1037         return NULL;
1038 }
1039
1040 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1041 {
1042         struct rt6_info *rt = (struct rt6_info *) dst;
1043
1044         if (rt) {
1045                 if (rt->rt6i_flags & RTF_CACHE) {
1046                         if (rt6_check_expired(rt)) {
1047                                 ip6_del_rt(rt);
1048                                 dst = NULL;
1049                         }
1050                 } else {
1051                         dst_release(dst);
1052                         dst = NULL;
1053                 }
1054         }
1055         return dst;
1056 }
1057
1058 static void ip6_link_failure(struct sk_buff *skb)
1059 {
1060         struct rt6_info *rt;
1061
1062         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1063
1064         rt = (struct rt6_info *) skb_dst(skb);
1065         if (rt) {
1066                 if (rt->rt6i_flags & RTF_CACHE)
1067                         rt6_update_expires(rt, 0);
1068                 else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
1069                         rt->rt6i_node->fn_sernum = -1;
1070         }
1071 }
1072
1073 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1074                                struct sk_buff *skb, u32 mtu)
1075 {
1076         struct rt6_info *rt6 = (struct rt6_info*)dst;
1077
1078         dst_confirm(dst);
1079         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
1080                 struct net *net = dev_net(dst->dev);
1081
1082                 rt6->rt6i_flags |= RTF_MODIFIED;
1083                 if (mtu < IPV6_MIN_MTU) {
1084                         u32 features = dst_metric(dst, RTAX_FEATURES);
1085                         mtu = IPV6_MIN_MTU;
1086                         features |= RTAX_FEATURE_ALLFRAG;
1087                         dst_metric_set(dst, RTAX_FEATURES, features);
1088                 }
1089                 dst_metric_set(dst, RTAX_MTU, mtu);
1090                 rt6_update_expires(rt6, net->ipv6.sysctl.ip6_rt_mtu_expires);
1091         }
1092 }
1093
1094 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
1095                      int oif, u32 mark)
1096 {
1097         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1098         struct dst_entry *dst;
1099         struct flowi6 fl6;
1100
1101         memset(&fl6, 0, sizeof(fl6));
1102         fl6.flowi6_oif = oif;
1103         fl6.flowi6_mark = mark;
1104         fl6.flowi6_flags = 0;
1105         fl6.daddr = iph->daddr;
1106         fl6.saddr = iph->saddr;
1107         fl6.flowlabel = (*(__be32 *) iph) & IPV6_FLOWINFO_MASK;
1108
1109         dst = ip6_route_output(net, NULL, &fl6);
1110         if (!dst->error)
1111                 ip6_rt_update_pmtu(dst, NULL, skb, ntohl(mtu));
1112         dst_release(dst);
1113 }
1114 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
1115
1116 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
1117 {
1118         ip6_update_pmtu(skb, sock_net(sk), mtu,
1119                         sk->sk_bound_dev_if, sk->sk_mark);
1120 }
1121 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
1122
1123 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark)
1124 {
1125         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1126         struct dst_entry *dst;
1127         struct flowi6 fl6;
1128
1129         memset(&fl6, 0, sizeof(fl6));
1130         fl6.flowi6_oif = oif;
1131         fl6.flowi6_mark = mark;
1132         fl6.flowi6_flags = 0;
1133         fl6.daddr = iph->daddr;
1134         fl6.saddr = iph->saddr;
1135         fl6.flowlabel = (*(__be32 *) iph) & IPV6_FLOWINFO_MASK;
1136
1137         dst = ip6_route_output(net, NULL, &fl6);
1138         if (!dst->error)
1139                 rt6_do_redirect(dst, NULL, skb);
1140         dst_release(dst);
1141 }
1142 EXPORT_SYMBOL_GPL(ip6_redirect);
1143
1144 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
1145 {
1146         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark);
1147 }
1148 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
1149
1150 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1151 {
1152         struct net_device *dev = dst->dev;
1153         unsigned int mtu = dst_mtu(dst);
1154         struct net *net = dev_net(dev);
1155
1156         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1157
1158         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1159                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1160
1161         /*
1162          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1163          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1164          * IPV6_MAXPLEN is also valid and means: "any MSS,
1165          * rely only on pmtu discovery"
1166          */
1167         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1168                 mtu = IPV6_MAXPLEN;
1169         return mtu;
1170 }
1171
1172 static unsigned int ip6_mtu(const struct dst_entry *dst)
1173 {
1174         struct inet6_dev *idev;
1175         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1176
1177         if (mtu)
1178                 return mtu;
1179
1180         mtu = IPV6_MIN_MTU;
1181
1182         rcu_read_lock();
1183         idev = __in6_dev_get(dst->dev);
1184         if (idev)
1185                 mtu = idev->cnf.mtu6;
1186         rcu_read_unlock();
1187
1188         return mtu;
1189 }
1190
/*
 * Singly linked list (chained through dst->next) of the dst entries created
 * by icmp6_dst_alloc().  Entries are unlinked and freed by icmp6_dst_gc()
 * and icmp6_clean_all().  All list accesses take icmp6_dst_lock.
 */
1191 static struct dst_entry *icmp6_dst_gc_list;
1192 static DEFINE_SPINLOCK(icmp6_dst_lock);
1193
1194 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1195                                   struct neighbour *neigh,
1196                                   struct flowi6 *fl6)
1197 {
1198         struct dst_entry *dst;
1199         struct rt6_info *rt;
1200         struct inet6_dev *idev = in6_dev_get(dev);
1201         struct net *net = dev_net(dev);
1202
1203         if (unlikely(!idev))
1204                 return ERR_PTR(-ENODEV);
1205
1206         rt = ip6_dst_alloc(net, dev, 0, NULL);
1207         if (unlikely(!rt)) {
1208                 in6_dev_put(idev);
1209                 dst = ERR_PTR(-ENOMEM);
1210                 goto out;
1211         }
1212
1213         if (neigh)
1214                 neigh_hold(neigh);
1215         else {
1216                 neigh = ip6_neigh_lookup(&rt->dst, NULL, &fl6->daddr);
1217                 if (IS_ERR(neigh)) {
1218                         in6_dev_put(idev);
1219                         dst_free(&rt->dst);
1220                         return ERR_CAST(neigh);
1221                 }
1222         }
1223
1224         rt->dst.flags |= DST_HOST;
1225         rt->dst.output  = ip6_output;
1226         rt->n = neigh;
1227         atomic_set(&rt->dst.__refcnt, 1);
1228         rt->rt6i_dst.addr = fl6->daddr;
1229         rt->rt6i_dst.plen = 128;
1230         rt->rt6i_idev     = idev;
1231         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
1232
1233         spin_lock_bh(&icmp6_dst_lock);
1234         rt->dst.next = icmp6_dst_gc_list;
1235         icmp6_dst_gc_list = &rt->dst;
1236         spin_unlock_bh(&icmp6_dst_lock);
1237
1238         fib6_force_start_gc(net);
1239
1240         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
1241
1242 out:
1243         return dst;
1244 }
1245
1246 int icmp6_dst_gc(void)
1247 {
1248         struct dst_entry *dst, **pprev;
1249         int more = 0;
1250
1251         spin_lock_bh(&icmp6_dst_lock);
1252         pprev = &icmp6_dst_gc_list;
1253
1254         while ((dst = *pprev) != NULL) {
1255                 if (!atomic_read(&dst->__refcnt)) {
1256                         *pprev = dst->next;
1257                         dst_free(dst);
1258                 } else {
1259                         pprev = &dst->next;
1260                         ++more;
1261                 }
1262         }
1263
1264         spin_unlock_bh(&icmp6_dst_lock);
1265
1266         return more;
1267 }
1268
1269 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1270                             void *arg)
1271 {
1272         struct dst_entry *dst, **pprev;
1273
1274         spin_lock_bh(&icmp6_dst_lock);
1275         pprev = &icmp6_dst_gc_list;
1276         while ((dst = *pprev) != NULL) {
1277                 struct rt6_info *rt = (struct rt6_info *) dst;
1278                 if (func(rt, arg)) {
1279                         *pprev = dst->next;
1280                         dst_free(dst);
1281                 } else {
1282                         pprev = &dst->next;
1283                 }
1284         }
1285         spin_unlock_bh(&icmp6_dst_lock);
1286 }
1287
1288 static int ip6_dst_gc(struct dst_ops *ops)
1289 {
1290         unsigned long now = jiffies;
1291         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1292         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1293         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1294         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1295         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1296         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1297         int entries;
1298
1299         entries = dst_entries_get_fast(ops);
1300         if (time_after(rt_last_gc + rt_min_interval, now) &&
1301             entries <= rt_max_size)
1302                 goto out;
1303
1304         net->ipv6.ip6_rt_gc_expire++;
1305         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1306         net->ipv6.ip6_rt_last_gc = now;
1307         entries = dst_entries_get_slow(ops);
1308         if (entries < ops->gc_thresh)
1309                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1310 out:
1311         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1312         return entries > rt_max_size;
1313 }
1314
1315 /* Clean host part of a prefix. Not necessary in radix tree,
1316    but results in cleaner routing tables.
1317
1318    Remove it only when all the things will work!
1319  */
1320
1321 int ip6_dst_hoplimit(struct dst_entry *dst)
1322 {
1323         int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1324         if (hoplimit == 0) {
1325                 struct net_device *dev = dst->dev;
1326                 struct inet6_dev *idev;
1327
1328                 rcu_read_lock();
1329                 idev = __in6_dev_get(dev);
1330                 if (idev)
1331                         hoplimit = idev->cnf.hop_limit;
1332                 else
1333                         hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1334                 rcu_read_unlock();
1335         }
1336         return hoplimit;
1337 }
1338 EXPORT_SYMBOL(ip6_dst_hoplimit);
1339
1340 /*
1341  *
1342  */
1343
/*
 * Create the route described by @cfg and insert it into its FIB table.
 *
 * @cfg: route configuration.  Note that fc_metric and fc_protocol are
 *       rewritten with defaults when they are zero/RTPROT_UNSPEC, and
 *       fc_nlinfo.nl_net is overwritten before insertion.
 *
 * Returns the result of __ip6_ins_rt() once the route has been fully set
 * up, or a negative errno from one of the validation steps: -EINVAL (bad
 * prefix length, gateway, preferred source or metric type), -ENODEV (no
 * usable device / inet6 device), -ENOBUFS (no table), -ENOMEM (allocation
 * failure), -EHOSTUNREACH (gateway not reachable via a direct route), or
 * whatever rt6_bind_neighbour() reports.
 *
 * On every failure path ("out") the device and inet6-device references
 * taken here are dropped and the half-built route is freed.
 */
1344 int ip6_route_add(struct fib6_config *cfg)
1345 {
1346         int err;
1347         struct net *net = cfg->fc_nlinfo.nl_net;
1348         struct rt6_info *rt = NULL;
1349         struct net_device *dev = NULL;
1350         struct inet6_dev *idev = NULL;
1351         struct fib6_table *table;
1352         int addr_type;
1353
1354         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1355                 return -EINVAL;
1356 #ifndef CONFIG_IPV6_SUBTREES
             /* Source-specific routes need subtree support. */
1357         if (cfg->fc_src_len)
1358                 return -EINVAL;
1359 #endif
             /* An explicit interface must exist and have IPv6 state; both
              * references are kept until the route owns them or "out" drops them. */
1360         if (cfg->fc_ifindex) {
1361                 err = -ENODEV;
1362                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1363                 if (!dev)
1364                         goto out;
1365                 idev = in6_dev_get(dev);
1366                 if (!idev)
1367                         goto out;
1368         }
1369
1370         if (cfg->fc_metric == 0)
1371                 cfg->fc_metric = IP6_RT_PRIO_USER;
1372
             /*
              * Without NLM_F_CREATE on a netlink request only an existing table
              * is looked up first; a missing table is still created, but with a
              * warning.  All other callers create the table on demand.
              */
1373         err = -ENOBUFS;
1374         if (cfg->fc_nlinfo.nlh &&
1375             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1376                 table = fib6_get_table(net, cfg->fc_table);
1377                 if (!table) {
1378                         pr_warn("NLM_F_CREATE should be specified when creating new route\n");
1379                         table = fib6_new_table(net, cfg->fc_table);
1380                 }
1381         } else {
1382                 table = fib6_new_table(net, cfg->fc_table);
1383         }
1384
1385         if (!table)
1386                 goto out;
1387
1388         rt = ip6_dst_alloc(net, NULL, DST_NOCOUNT, table);
1389
1390         if (!rt) {
1391                 err = -ENOMEM;
1392                 goto out;
1393         }
1394
1395         rt->dst.obsolete = -1;
1396
             /* fc_expires is in clock_t units; convert to an absolute jiffies deadline. */
1397         if (cfg->fc_flags & RTF_EXPIRES)
1398                 rt6_set_expires(rt, jiffies +
1399                                 clock_t_to_jiffies(cfg->fc_expires));
1400         else
1401                 rt6_clean_expires(rt);
1402
1403         if (cfg->fc_protocol == RTPROT_UNSPEC)
1404                 cfg->fc_protocol = RTPROT_BOOT;
1405         rt->rt6i_protocol = cfg->fc_protocol;
1406
1407         addr_type = ipv6_addr_type(&cfg->fc_dst);
1408
             /* Pick the input handler from the destination type. */
1409         if (addr_type & IPV6_ADDR_MULTICAST)
1410                 rt->dst.input = ip6_mc_input;
1411         else if (cfg->fc_flags & RTF_LOCAL)
1412                 rt->dst.input = ip6_input;
1413         else
1414                 rt->dst.input = ip6_forward;
1415
1416         rt->dst.output = ip6_output;
1417
1418         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1419         rt->rt6i_dst.plen = cfg->fc_dst_len;
1420         if (rt->rt6i_dst.plen == 128)
1421                rt->dst.flags |= DST_HOST;
1422
             /* Non-host routes with metrics get their own zeroed metrics array. */
1423         if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) {
1424                 u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1425                 if (!metrics) {
1426                         err = -ENOMEM;
1427                         goto out;
1428                 }
1429                 dst_init_metrics(&rt->dst, metrics, 0);
1430         }
1431 #ifdef CONFIG_IPV6_SUBTREES
1432         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1433         rt->rt6i_src.plen = cfg->fc_src_len;
1434 #endif
1435
1436         rt->rt6i_metric = cfg->fc_metric;
1437
1438         /* We cannot add true routes via loopback here,
1439            they would result in kernel looping; promote them to reject routes
1440          */
1441         if ((cfg->fc_flags & RTF_REJECT) ||
1442             (dev && (dev->flags & IFF_LOOPBACK) &&
1443              !(addr_type & IPV6_ADDR_LOOPBACK) &&
1444              !(cfg->fc_flags & RTF_LOCAL))) {
1445                 /* hold loopback dev/idev if we haven't done so. */
1446                 if (dev != net->loopback_dev) {
1447                         if (dev) {
1448                                 dev_put(dev);
1449                                 in6_dev_put(idev);
1450                         }
1451                         dev = net->loopback_dev;
1452                         dev_hold(dev);
1453                         idev = in6_dev_get(dev);
1454                         if (!idev) {
1455                                 err = -ENODEV;
1456                                 goto out;
1457                         }
1458                 }
1459                 rt->dst.output = ip6_pkt_discard_out;
1460                 rt->dst.input = ip6_pkt_discard;
1461                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
                     /* The route type selects the error stored in the route. */
1462                 switch (cfg->fc_type) {
1463                 case RTN_BLACKHOLE:
1464                         rt->dst.error = -EINVAL;
1465                         break;
1466                 case RTN_PROHIBIT:
1467                         rt->dst.error = -EACCES;
1468                         break;
1469                 case RTN_THROW:
1470                         rt->dst.error = -EAGAIN;
1471                         break;
1472                 default:
1473                         rt->dst.error = -ENETUNREACH;
1474                         break;
1475                 }
1476                 goto install_route;
1477         }
1478
1479         if (cfg->fc_flags & RTF_GATEWAY) {
1480                 const struct in6_addr *gw_addr;
1481                 int gwa_type;
1482
1483                 gw_addr = &cfg->fc_gateway;
1484                 rt->rt6i_gateway = *gw_addr;
1485                 gwa_type = ipv6_addr_type(gw_addr);
1486
1487                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1488                         struct rt6_info *grt;
1489
1490                         /* IPv6 strictly inhibits using not link-local
1491                            addresses as nexthop address.
1492                            Otherwise, router will not able to send redirects.
1493                            It is very good, but in some (rare!) circumstances
1494                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1495                            some exceptions. --ANK
1496                          */
1497                         err = -EINVAL;
1498                         if (!(gwa_type & IPV6_ADDR_UNICAST))
1499                                 goto out;
1500
1501                         grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1502
                             /*
                              * The gateway must be reachable through a route that is
                              * not itself a gateway route, and on the requested device
                              * if one was given.  err stays -EHOSTUNREACH unless that
                              * holds.
                              */
1503                         err = -EHOSTUNREACH;
1504                         if (!grt)
1505                                 goto out;
1506                         if (dev) {
1507                                 if (dev != grt->dst.dev) {
1508                                         dst_release(&grt->dst);
1509                                         goto out;
1510                                 }
1511                         } else {
                                     /* No device requested: adopt the one the gateway
                                      * route uses, taking our own references. */
1512                                 dev = grt->dst.dev;
1513                                 idev = grt->rt6i_idev;
1514                                 dev_hold(dev);
1515                                 in6_dev_hold(grt->rt6i_idev);
1516                         }
1517                         if (!(grt->rt6i_flags & RTF_GATEWAY))
1518                                 err = 0;
1519                         dst_release(&grt->dst);
1520
1521                         if (err)
1522                                 goto out;
1523                 }
                     /* Gateway routes need a real, non-loopback device. */
1524                 err = -EINVAL;
1525                 if (!dev || (dev->flags & IFF_LOOPBACK))
1526                         goto out;
1527         }
1528
1529         err = -ENODEV;
1530         if (!dev)
1531                 goto out;
1532
             /* A preferred source address must pass ipv6_chk_addr() for this device. */
1533         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1534                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1535                         err = -EINVAL;
1536                         goto out;
1537                 }
1538                 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
1539                 rt->rt6i_prefsrc.plen = 128;
1540         } else
1541                 rt->rt6i_prefsrc.plen = 0;
1542
1543         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1544                 err = rt6_bind_neighbour(rt, dev);
1545                 if (err)
1546                         goto out;
1547         }
1548
1549         rt->rt6i_flags = cfg->fc_flags;
1550
1551 install_route:
             /* Apply the metrics supplied as netlink attributes; attribute type 0
              * is skipped and types above RTAX_MAX are rejected. */
1552         if (cfg->fc_mx) {
1553                 struct nlattr *nla;
1554                 int remaining;
1555
1556                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1557                         int type = nla_type(nla);
1558
1559                         if (type) {
1560                                 if (type > RTAX_MAX) {
1561                                         err = -EINVAL;
1562                                         goto out;
1563                                 }
1564
1565                                 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1566                         }
1567                 }
1568         }
1569
             /* The dev/idev references taken above are stored in the route here
              * rather than dropped. */
1570         rt->dst.dev = dev;
1571         rt->rt6i_idev = idev;
1572         rt->rt6i_table = table;
1573
1574         cfg->fc_nlinfo.nl_net = dev_net(dev);
1575
1576         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1577
1578 out:
1579         if (dev)
1580                 dev_put(dev);
1581         if (idev)
1582                 in6_dev_put(idev);
1583         if (rt)
1584                 dst_free(&rt->dst);
1585         return err;
1586 }
1587
1588 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1589 {
1590         int err;
1591         struct fib6_table *table;
1592         struct net *net = dev_net(rt->dst.dev);
1593
1594         if (rt == net->ipv6.ip6_null_entry)
1595                 return -ENOENT;
1596
1597         table = rt->rt6i_table;
1598         write_lock_bh(&table->tb6_lock);
1599
1600         err = fib6_del(rt, info);
1601         dst_release(&rt->dst);
1602
1603         write_unlock_bh(&table->tb6_lock);
1604
1605         return err;
1606 }
1607
1608 int ip6_del_rt(struct rt6_info *rt)
1609 {
1610         struct nl_info info = {
1611                 .nl_net = dev_net(rt->dst.dev),
1612         };
1613         return __ip6_del_rt(rt, &info);
1614 }
1615
1616 static int ip6_route_del(struct fib6_config *cfg)
1617 {
1618         struct fib6_table *table;
1619         struct fib6_node *fn;
1620         struct rt6_info *rt;
1621         int err = -ESRCH;
1622
1623         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1624         if (!table)
1625                 return err;
1626
1627         read_lock_bh(&table->tb6_lock);
1628
1629         fn = fib6_locate(&table->tb6_root,
1630                          &cfg->fc_dst, cfg->fc_dst_len,
1631                          &cfg->fc_src, cfg->fc_src_len);
1632
1633         if (fn) {
1634                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1635                         if (cfg->fc_ifindex &&
1636                             (!rt->dst.dev ||
1637                              rt->dst.dev->ifindex != cfg->fc_ifindex))
1638                                 continue;
1639                         if (cfg->fc_flags & RTF_GATEWAY &&
1640                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1641                                 continue;
1642                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1643                                 continue;
1644                         dst_hold(&rt->dst);
1645                         read_unlock_bh(&table->tb6_lock);
1646
1647                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1648                 }
1649         }
1650         read_unlock_bh(&table->tb6_lock);
1651
1652         return err;
1653 }
1654
/*
 * Handle a received ICMPv6 Redirect for the route @dst.
 *
 * @dst: route currently in use for the redirected destination
 * @sk:  not used in this function
 * @skb: the Redirect message; the target and destination addresses are read
 *       from directly behind the ICMPv6 header and are followed by the ND
 *       options
 *
 * After validating the message, the neighbour entry for the target is
 * updated, a RTF_CACHE|RTF_DYNAMIC copy of the route for the destination is
 * installed pointing at that neighbour, NETEVENT_REDIRECT is raised, and the
 * old route is deleted if it was itself a cache entry.  Every validation or
 * allocation failure just returns without changing the routing table.
 */
1655 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
1656 {
1657         struct net *net = dev_net(skb->dev);
1658         struct netevent_redirect netevent;
1659         struct rt6_info *rt, *nrt = NULL;
1660         const struct in6_addr *target;
1661         struct ndisc_options ndopts;
1662         const struct in6_addr *dest;
1663         struct neighbour *old_neigh;
1664         struct inet6_dev *in6_dev;
1665         struct neighbour *neigh;
1666         struct icmp6hdr *icmph;
1667         int optlen, on_link;
1668         u8 *lladdr;
1669
             /* Bytes left for ND options after the ICMPv6 header and the two addresses. */
1670         optlen = skb->tail - skb->transport_header;
1671         optlen -= sizeof(struct icmp6hdr) + 2 * sizeof(struct in6_addr);
1672
1673         if (optlen < 0) {
1674                 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
1675                 return;
1676         }
1677
             /* Layout: ICMPv6 header, target address, destination address. */
1678         icmph = icmp6_hdr(skb);
1679         target = (const struct in6_addr *) (icmph + 1);
1680         dest = target + 1;
1681
1682         if (ipv6_addr_is_multicast(dest)) {
1683                 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
1684                 return;
1685         }
1686
             /* target == dest means the destination is on-link; otherwise the
              * target must be a link-local unicast address. */
1687         on_link = 0;
1688         if (ipv6_addr_equal(dest, target)) {
1689                 on_link = 1;
1690         } else if (ipv6_addr_type(target) !=
1691                    (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
1692                 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
1693                 return;
1694         }
1695
             /* Ignore redirects on interfaces that forward or have
              * accept_redirects switched off. */
1696         in6_dev = __in6_dev_get(skb->dev);
1697         if (!in6_dev)
1698                 return;
1699         if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
1700                 return;
1701
1702         /* RFC2461 8.1:
1703          *      The IP source address of the Redirect MUST be the same as the current
1704          *      first-hop router for the specified ICMP Destination Address.
1705          */
1706
1707         if (!ndisc_parse_options((u8*)(dest + 1), optlen, &ndopts)) {
1708                 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
1709                 return;
1710         }
1711
             /* The target link-layer address option is optional. */
1712         lladdr = NULL;
1713         if (ndopts.nd_opts_tgt_lladdr) {
1714                 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
1715                                              skb->dev);
1716                 if (!lladdr) {
1717                         net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
1718                         return;
1719                 }
1720         }
1721
1722         rt = (struct rt6_info *) dst;
1723         if (rt == net->ipv6.ip6_null_entry) {
1724                 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
1725                 return;
1726         }
1727
1728         /* Redirect received -> path was valid.
1729          * Look, redirects are sent only in response to data packets,
1730          * so that this nexthop apparently is reachable. --ANK
1731          */
1732         dst_confirm(&rt->dst);
1733
             /* NOTE(review): the final argument (1) presumably asks for the entry
              * to be created when missing — confirm against __neigh_lookup().
              * The reference obtained here is dropped at "out". */
1734         neigh = __neigh_lookup(&nd_tbl, target, skb->dev, 1);
1735         if (!neigh)
1736                 return;
1737
1738         /* Duplicate redirect: silently ignore. */
1739         old_neigh = rt->n;
1740         if (neigh == old_neigh)
1741                 goto out;
1742
1743         /*
1744          *      We have finally decided to accept it.
1745          */
1746
             /* The router flags are only forced when the target is a router,
              * i.e. not for an on-link redirect. */
1747         neigh_update(neigh, lladdr, NUD_STALE,
1748                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1749                      NEIGH_UPDATE_F_OVERRIDE|
1750                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1751                                      NEIGH_UPDATE_F_ISROUTER))
1752                      );
1753
1754         nrt = ip6_rt_copy(rt, dest);
1755         if (!nrt)
1756                 goto out;
1757
1758         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1759         if (on_link)
1760                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1761
1762         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
1763         nrt->n = neigh_clone(neigh);
1764
1765         if (ip6_ins_rt(nrt))
1766                 goto out;
1767
             /* Tell netevent listeners about the old and new route/neighbour pair. */
1768         netevent.old = &rt->dst;
1769         netevent.old_neigh = old_neigh;
1770         netevent.new = &nrt->dst;
1771         netevent.new_neigh = neigh;
1772         netevent.daddr = dest;
1773         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1774
             /* A cache route has been superseded: take an extra reference via
              * dst_clone() and hand the route to ip6_del_rt(). */
1775         if (rt->rt6i_flags & RTF_CACHE) {
1776                 rt = (struct rt6_info *) dst_clone(&rt->dst);
1777                 ip6_del_rt(rt);
1778         }
1779
1780 out:
1781         neigh_release(neigh);
1782 }
1783
1784 /*
1785  *      Misc support functions
1786  */
1787
1788 static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
1789                                     const struct in6_addr *dest)
1790 {
1791         struct net *net = dev_net(ort->dst.dev);
1792         struct rt6_info *rt = ip6_dst_alloc(net, ort->dst.dev, 0,
1793                                             ort->rt6i_table);
1794
1795         if (rt) {
1796                 rt->dst.input = ort->dst.input;
1797                 rt->dst.output = ort->dst.output;
1798                 rt->dst.flags |= DST_HOST;
1799
1800                 rt->rt6i_dst.addr = *dest;
1801                 rt->rt6i_dst.plen = 128;
1802                 dst_copy_metrics(&rt->dst, &ort->dst);
1803                 rt->dst.error = ort->dst.error;
1804                 rt->rt6i_idev = ort->rt6i_idev;
1805                 if (rt->rt6i_idev)
1806                         in6_dev_hold(rt->rt6i_idev);
1807                 rt->dst.lastuse = jiffies;
1808
1809                 rt->rt6i_gateway = ort->rt6i_gateway;
1810                 rt->rt6i_flags = ort->rt6i_flags;
1811                 if ((ort->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) ==
1812                     (RTF_DEFAULT | RTF_ADDRCONF))
1813                         rt6_set_from(rt, ort);
1814                 else
1815                         rt6_clean_expires(rt);
1816                 rt->rt6i_metric = 0;
1817
1818 #ifdef CONFIG_IPV6_SUBTREES
1819                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1820 #endif
1821                 memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1822                 rt->rt6i_table = ort->rt6i_table;
1823         }
1824         return rt;
1825 }
1826
1827 #ifdef CONFIG_IPV6_ROUTE_INFO
1828 static struct rt6_info *rt6_get_route_info(struct net *net,
1829                                            const struct in6_addr *prefix, int prefixlen,
1830                                            const struct in6_addr *gwaddr, int ifindex)
1831 {
1832         struct fib6_node *fn;
1833         struct rt6_info *rt = NULL;
1834         struct fib6_table *table;
1835
1836         table = fib6_get_table(net, RT6_TABLE_INFO);
1837         if (!table)
1838                 return NULL;
1839
1840         write_lock_bh(&table->tb6_lock);
1841         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1842         if (!fn)
1843                 goto out;
1844
1845         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1846                 if (rt->dst.dev->ifindex != ifindex)
1847                         continue;
1848                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1849                         continue;
1850                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1851                         continue;
1852                 dst_hold(&rt->dst);
1853                 break;
1854         }
1855 out:
1856         write_unlock_bh(&table->tb6_lock);
1857         return rt;
1858 }
1859
1860 static struct rt6_info *rt6_add_route_info(struct net *net,
1861                                            const struct in6_addr *prefix, int prefixlen,
1862                                            const struct in6_addr *gwaddr, int ifindex,
1863                                            unsigned int pref)
1864 {
1865         struct fib6_config cfg = {
1866                 .fc_table       = RT6_TABLE_INFO,
1867                 .fc_metric      = IP6_RT_PRIO_USER,
1868                 .fc_ifindex     = ifindex,
1869                 .fc_dst_len     = prefixlen,
1870                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1871                                   RTF_UP | RTF_PREF(pref),
1872                 .fc_nlinfo.portid = 0,
1873                 .fc_nlinfo.nlh = NULL,
1874                 .fc_nlinfo.nl_net = net,
1875         };
1876
1877         cfg.fc_dst = *prefix;
1878         cfg.fc_gateway = *gwaddr;
1879
1880         /* We should treat it as a default route if prefix length is 0. */
1881         if (!prefixlen)
1882                 cfg.fc_flags |= RTF_DEFAULT;
1883
1884         ip6_route_add(&cfg);
1885
1886         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1887 }
1888 #endif
1889
1890 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1891 {
1892         struct rt6_info *rt;
1893         struct fib6_table *table;
1894
1895         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1896         if (!table)
1897                 return NULL;
1898
1899         write_lock_bh(&table->tb6_lock);
1900         for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1901                 if (dev == rt->dst.dev &&
1902                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1903                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1904                         break;
1905         }
1906         if (rt)
1907                 dst_hold(&rt->dst);
1908         write_unlock_bh(&table->tb6_lock);
1909         return rt;
1910 }
1911
1912 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1913                                      struct net_device *dev,
1914                                      unsigned int pref)
1915 {
1916         struct fib6_config cfg = {
1917                 .fc_table       = RT6_TABLE_DFLT,
1918                 .fc_metric      = IP6_RT_PRIO_USER,
1919                 .fc_ifindex     = dev->ifindex,
1920                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1921                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1922                 .fc_nlinfo.portid = 0,
1923                 .fc_nlinfo.nlh = NULL,
1924                 .fc_nlinfo.nl_net = dev_net(dev),
1925         };
1926
1927         cfg.fc_gateway = *gwaddr;
1928
1929         ip6_route_add(&cfg);
1930
1931         return rt6_get_dflt_router(gwaddr, dev);
1932 }
1933
1934 void rt6_purge_dflt_routers(struct net *net)
1935 {
1936         struct rt6_info *rt;
1937         struct fib6_table *table;
1938
1939         /* NOTE: Keep consistent with rt6_get_dflt_router */
1940         table = fib6_get_table(net, RT6_TABLE_DFLT);
1941         if (!table)
1942                 return;
1943
1944 restart:
1945         read_lock_bh(&table->tb6_lock);
1946         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1947                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1948                         dst_hold(&rt->dst);
1949                         read_unlock_bh(&table->tb6_lock);
1950                         ip6_del_rt(rt);
1951                         goto restart;
1952                 }
1953         }
1954         read_unlock_bh(&table->tb6_lock);
1955 }
1956
1957 static void rtmsg_to_fib6_config(struct net *net,
1958                                  struct in6_rtmsg *rtmsg,
1959                                  struct fib6_config *cfg)
1960 {
1961         memset(cfg, 0, sizeof(*cfg));
1962
1963         cfg->fc_table = RT6_TABLE_MAIN;
1964         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1965         cfg->fc_metric = rtmsg->rtmsg_metric;
1966         cfg->fc_expires = rtmsg->rtmsg_info;
1967         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1968         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1969         cfg->fc_flags = rtmsg->rtmsg_flags;
1970
1971         cfg->fc_nlinfo.nl_net = net;
1972
1973         cfg->fc_dst = rtmsg->rtmsg_dst;
1974         cfg->fc_src = rtmsg->rtmsg_src;
1975         cfg->fc_gateway = rtmsg->rtmsg_gateway;
1976 }
1977
1978 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1979 {
1980         struct fib6_config cfg;
1981         struct in6_rtmsg rtmsg;
1982         int err;
1983
1984         switch(cmd) {
1985         case SIOCADDRT:         /* Add a route */
1986         case SIOCDELRT:         /* Delete a route */
1987                 if (!capable(CAP_NET_ADMIN))
1988                         return -EPERM;
1989                 err = copy_from_user(&rtmsg, arg,
1990                                      sizeof(struct in6_rtmsg));
1991                 if (err)
1992                         return -EFAULT;
1993
1994                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1995
1996                 rtnl_lock();
1997                 switch (cmd) {
1998                 case SIOCADDRT:
1999                         err = ip6_route_add(&cfg);
2000                         break;
2001                 case SIOCDELRT:
2002                         err = ip6_route_del(&cfg);
2003                         break;
2004                 default:
2005                         err = -EINVAL;
2006                 }
2007                 rtnl_unlock();
2008
2009                 return err;
2010         }
2011
2012         return -EINVAL;
2013 }
2014
2015 /*
2016  *      Drop the packet on the floor
2017  */
2018
2019 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2020 {
2021         int type;
2022         struct dst_entry *dst = skb_dst(skb);
2023         switch (ipstats_mib_noroutes) {
2024         case IPSTATS_MIB_INNOROUTES:
2025                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2026                 if (type == IPV6_ADDR_ANY) {
2027                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2028                                       IPSTATS_MIB_INADDRERRORS);
2029                         break;
2030                 }
2031                 /* FALLTHROUGH */
2032         case IPSTATS_MIB_OUTNOROUTES:
2033                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2034                               ipstats_mib_noroutes);
2035                 break;
2036         }
2037         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2038         kfree_skb(skb);
2039         return 0;
2040 }
2041
2042 static int ip6_pkt_discard(struct sk_buff *skb)
2043 {
2044         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2045 }
2046
2047 static int ip6_pkt_discard_out(struct sk_buff *skb)
2048 {
2049         skb->dev = skb_dst(skb)->dev;
2050         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2051 }
2052
2053 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2054
2055 static int ip6_pkt_prohibit(struct sk_buff *skb)
2056 {
2057         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2058 }
2059
2060 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2061 {
2062         skb->dev = skb_dst(skb)->dev;
2063         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2064 }
2065
2066 #endif
2067
2068 /*
2069  *      Allocate a dst for local (unicast / anycast) address.
2070  */
2071
2072 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2073                                     const struct in6_addr *addr,
2074                                     bool anycast)
2075 {
2076         struct net *net = dev_net(idev->dev);
2077         struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev, 0, NULL);
2078         int err;
2079
2080         if (!rt) {
2081                 net_warn_ratelimited("Maximum number of routes reached, consider increasing route/max_size\n");
2082                 return ERR_PTR(-ENOMEM);
2083         }
2084
2085         in6_dev_hold(idev);
2086
2087         rt->dst.flags |= DST_HOST;
2088         rt->dst.input = ip6_input;
2089         rt->dst.output = ip6_output;
2090         rt->rt6i_idev = idev;
2091         rt->dst.obsolete = -1;
2092
2093         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2094         if (anycast)
2095                 rt->rt6i_flags |= RTF_ANYCAST;
2096         else
2097                 rt->rt6i_flags |= RTF_LOCAL;
2098         err = rt6_bind_neighbour(rt, rt->dst.dev);
2099         if (err) {
2100                 dst_free(&rt->dst);
2101                 return ERR_PTR(err);
2102         }
2103
2104         rt->rt6i_dst.addr = *addr;
2105         rt->rt6i_dst.plen = 128;
2106         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2107
2108         atomic_set(&rt->dst.__refcnt, 1);
2109
2110         return rt;
2111 }
2112
2113 int ip6_route_get_saddr(struct net *net,
2114                         struct rt6_info *rt,
2115                         const struct in6_addr *daddr,
2116                         unsigned int prefs,
2117                         struct in6_addr *saddr)
2118 {
2119         struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2120         int err = 0;
2121         if (rt->rt6i_prefsrc.plen)
2122                 *saddr = rt->rt6i_prefsrc.addr;
2123         else
2124                 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2125                                          daddr, prefs, saddr);
2126         return err;
2127 }
2128
2129 /* remove deleted ip from prefsrc entries */
/* Walker argument for fib6_remove_prefsrc(). */
struct arg_dev_net_ip {
	struct net_device *dev;		/* device to match; NULL matches any */
	struct net *net;		/* namespace whose null entry is skipped */
	struct in6_addr *addr;		/* address compared with the route's prefsrc */
};
2135
2136 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2137 {
2138         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2139         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2140         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2141
2142         if (((void *)rt->dst.dev == dev || !dev) &&
2143             rt != net->ipv6.ip6_null_entry &&
2144             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2145                 /* remove prefsrc entry */
2146                 rt->rt6i_prefsrc.plen = 0;
2147         }
2148         return 0;
2149 }
2150
2151 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2152 {
2153         struct net *net = dev_net(ifp->idev->dev);
2154         struct arg_dev_net_ip adni = {
2155                 .dev = ifp->idev->dev,
2156                 .net = net,
2157                 .addr = &ifp->addr,
2158         };
2159         fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
2160 }
2161
/* Walker argument for fib6_ifdown(). */
struct arg_dev_net {
	struct net_device *dev;		/* device to match; NULL matches any */
	struct net *net;		/* namespace whose null entry is kept */
};
2166
2167 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2168 {
2169         const struct arg_dev_net *adn = arg;
2170         const struct net_device *dev = adn->dev;
2171
2172         if ((rt->dst.dev == dev || !dev) &&
2173             rt != adn->net->ipv6.ip6_null_entry)
2174                 return -1;
2175
2176         return 0;
2177 }
2178
2179 void rt6_ifdown(struct net *net, struct net_device *dev)
2180 {
2181         struct arg_dev_net adn = {
2182                 .dev = dev,
2183                 .net = net,
2184         };
2185
2186         fib6_clean_all(net, fib6_ifdown, 0, &adn);
2187         icmp6_clean_all(fib6_ifdown, &adn);
2188 }
2189
/* Walker argument for rt6_mtu_change_route(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;		/* device whose routes are examined */
	unsigned int mtu;		/* MTU value to store in matching routes */
};
2194
2195 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2196 {
2197         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2198         struct inet6_dev *idev;
2199
2200         /* In IPv6 pmtu discovery is not optional,
2201            so that RTAX_MTU lock cannot disable it.
2202            We still use this lock to block changes
2203            caused by addrconf/ndisc.
2204         */
2205
2206         idev = __in6_dev_get(arg->dev);
2207         if (!idev)
2208                 return 0;
2209
2210         /* For administrative MTU increase, there is no way to discover
2211            IPv6 PMTU increase, so PMTU increase should be updated here.
2212            Since RFC 1981 doesn't include administrative MTU increase
2213            update PMTU increase is a MUST. (i.e. jumbo frame)
2214          */
2215         /*
2216            If new MTU is less than route PMTU, this new MTU will be the
2217            lowest MTU in the path, update the route PMTU to reflect PMTU
2218            decreases; if new MTU is greater than route PMTU, and the
2219            old MTU is the lowest MTU in the path, update the route PMTU
2220            to reflect the increase. In this case if the other nodes' MTU
2221            also have the lowest MTU, TOO BIG MESSAGE will be lead to
2222            PMTU discouvery.
2223          */
2224         if (rt->dst.dev == arg->dev &&
2225             !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2226             (dst_mtu(&rt->dst) >= arg->mtu ||
2227              (dst_mtu(&rt->dst) < arg->mtu &&
2228               dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2229                 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2230         }
2231         return 0;
2232 }
2233
2234 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
2235 {
2236         struct rt6_mtu_change_arg arg = {
2237                 .dev = dev,
2238                 .mtu = mtu,
2239         };
2240
2241         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2242 }
2243
2244 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2245         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2246         [RTA_OIF]               = { .type = NLA_U32 },
2247         [RTA_IIF]               = { .type = NLA_U32 },
2248         [RTA_PRIORITY]          = { .type = NLA_U32 },
2249         [RTA_METRICS]           = { .type = NLA_NESTED },
2250 };
2251
2252 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2253                               struct fib6_config *cfg)
2254 {
2255         struct rtmsg *rtm;
2256         struct nlattr *tb[RTA_MAX+1];
2257         int err;
2258
2259         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2260         if (err < 0)
2261                 goto errout;
2262
2263         err = -EINVAL;
2264         rtm = nlmsg_data(nlh);
2265         memset(cfg, 0, sizeof(*cfg));
2266
2267         cfg->fc_table = rtm->rtm_table;
2268         cfg->fc_dst_len = rtm->rtm_dst_len;
2269         cfg->fc_src_len = rtm->rtm_src_len;
2270         cfg->fc_flags = RTF_UP;
2271         cfg->fc_protocol = rtm->rtm_protocol;
2272         cfg->fc_type = rtm->rtm_type;
2273
2274         if (rtm->rtm_type == RTN_UNREACHABLE ||
2275             rtm->rtm_type == RTN_BLACKHOLE ||
2276             rtm->rtm_type == RTN_PROHIBIT ||
2277             rtm->rtm_type == RTN_THROW)
2278                 cfg->fc_flags |= RTF_REJECT;
2279
2280         if (rtm->rtm_type == RTN_LOCAL)
2281                 cfg->fc_flags |= RTF_LOCAL;
2282
2283         cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
2284         cfg->fc_nlinfo.nlh = nlh;
2285         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2286
2287         if (tb[RTA_GATEWAY]) {
2288                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2289                 cfg->fc_flags |= RTF_GATEWAY;
2290         }
2291
2292         if (tb[RTA_DST]) {
2293                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2294
2295                 if (nla_len(tb[RTA_DST]) < plen)
2296                         goto errout;
2297
2298                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2299         }
2300
2301         if (tb[RTA_SRC]) {
2302                 int plen = (rtm->rtm_src_len + 7) >> 3;
2303
2304                 if (nla_len(tb[RTA_SRC]) < plen)
2305                         goto errout;
2306
2307                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2308         }
2309
2310         if (tb[RTA_PREFSRC])
2311                 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2312
2313         if (tb[RTA_OIF])
2314                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2315
2316         if (tb[RTA_PRIORITY])
2317                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2318
2319         if (tb[RTA_METRICS]) {
2320                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2321                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2322         }
2323
2324         if (tb[RTA_TABLE])
2325                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2326
2327         err = 0;
2328 errout:
2329         return err;
2330 }
2331
2332 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2333 {
2334         struct fib6_config cfg;
2335         int err;
2336
2337         err = rtm_to_fib6_config(skb, nlh, &cfg);
2338         if (err < 0)
2339                 return err;
2340
2341         return ip6_route_del(&cfg);
2342 }
2343
2344 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2345 {
2346         struct fib6_config cfg;
2347         int err;
2348
2349         err = rtm_to_fib6_config(skb, nlh, &cfg);
2350         if (err < 0)
2351                 return err;
2352
2353         return ip6_route_add(&cfg);
2354 }
2355
2356 static inline size_t rt6_nlmsg_size(void)
2357 {
2358         return NLMSG_ALIGN(sizeof(struct rtmsg))
2359                + nla_total_size(16) /* RTA_SRC */
2360                + nla_total_size(16) /* RTA_DST */
2361                + nla_total_size(16) /* RTA_GATEWAY */
2362                + nla_total_size(16) /* RTA_PREFSRC */
2363                + nla_total_size(4) /* RTA_TABLE */
2364                + nla_total_size(4) /* RTA_IIF */
2365                + nla_total_size(4) /* RTA_OIF */
2366                + nla_total_size(4) /* RTA_PRIORITY */
2367                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2368                + nla_total_size(sizeof(struct rta_cacheinfo));
2369 }
2370
2371 static int rt6_fill_node(struct net *net,
2372                          struct sk_buff *skb, struct rt6_info *rt,
2373                          struct in6_addr *dst, struct in6_addr *src,
2374                          int iif, int type, u32 portid, u32 seq,
2375                          int prefix, int nowait, unsigned int flags)
2376 {
2377         struct rtmsg *rtm;
2378         struct nlmsghdr *nlh;
2379         long expires;
2380         u32 table;
2381         struct neighbour *n;
2382
2383         if (prefix) {   /* user wants prefix routes only */
2384                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2385                         /* success since this is not a prefix route */
2386                         return 1;
2387                 }
2388         }
2389
2390         nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
2391         if (!nlh)
2392                 return -EMSGSIZE;
2393
2394         rtm = nlmsg_data(nlh);
2395         rtm->rtm_family = AF_INET6;
2396         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2397         rtm->rtm_src_len = rt->rt6i_src.plen;
2398         rtm->rtm_tos = 0;
2399         if (rt->rt6i_table)
2400                 table = rt->rt6i_table->tb6_id;
2401         else
2402                 table = RT6_TABLE_UNSPEC;
2403         rtm->rtm_table = table;
2404         if (nla_put_u32(skb, RTA_TABLE, table))
2405                 goto nla_put_failure;
2406         if (rt->rt6i_flags & RTF_REJECT) {
2407                 switch (rt->dst.error) {
2408                 case -EINVAL:
2409                         rtm->rtm_type = RTN_BLACKHOLE;
2410                         break;
2411                 case -EACCES:
2412                         rtm->rtm_type = RTN_PROHIBIT;
2413                         break;
2414                 case -EAGAIN:
2415                         rtm->rtm_type = RTN_THROW;
2416                         break;
2417                 default:
2418                         rtm->rtm_type = RTN_UNREACHABLE;
2419                         break;
2420                 }
2421         }
2422         else if (rt->rt6i_flags & RTF_LOCAL)
2423                 rtm->rtm_type = RTN_LOCAL;
2424         else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
2425                 rtm->rtm_type = RTN_LOCAL;
2426         else
2427                 rtm->rtm_type = RTN_UNICAST;
2428         rtm->rtm_flags = 0;
2429         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2430         rtm->rtm_protocol = rt->rt6i_protocol;
2431         if (rt->rt6i_flags & RTF_DYNAMIC)
2432                 rtm->rtm_protocol = RTPROT_REDIRECT;
2433         else if (rt->rt6i_flags & RTF_ADDRCONF) {
2434                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ROUTEINFO))
2435                         rtm->rtm_protocol = RTPROT_RA;
2436                 else
2437                         rtm->rtm_protocol = RTPROT_KERNEL;
2438         }
2439
2440         if (rt->rt6i_flags & RTF_CACHE)
2441                 rtm->rtm_flags |= RTM_F_CLONED;
2442
2443         if (dst) {
2444                 if (nla_put(skb, RTA_DST, 16, dst))
2445                         goto nla_put_failure;
2446                 rtm->rtm_dst_len = 128;
2447         } else if (rtm->rtm_dst_len)
2448                 if (nla_put(skb, RTA_DST, 16, &rt->rt6i_dst.addr))
2449                         goto nla_put_failure;
2450 #ifdef CONFIG_IPV6_SUBTREES
2451         if (src) {
2452                 if (nla_put(skb, RTA_SRC, 16, src))
2453                         goto nla_put_failure;
2454                 rtm->rtm_src_len = 128;
2455         } else if (rtm->rtm_src_len &&
2456                    nla_put(skb, RTA_SRC, 16, &rt->rt6i_src.addr))
2457                 goto nla_put_failure;
2458 #endif
2459         if (iif) {
2460 #ifdef CONFIG_IPV6_MROUTE
2461                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2462                         int err = ip6mr_get_route(net, skb, rtm, nowait);
2463                         if (err <= 0) {
2464                                 if (!nowait) {
2465                                         if (err == 0)
2466                                                 return 0;
2467                                         goto nla_put_failure;
2468                                 } else {
2469                                         if (err == -EMSGSIZE)
2470                                                 goto nla_put_failure;
2471                                 }
2472                         }
2473                 } else
2474 #endif
2475                         if (nla_put_u32(skb, RTA_IIF, iif))
2476                                 goto nla_put_failure;
2477         } else if (dst) {
2478                 struct in6_addr saddr_buf;
2479                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
2480                     nla_put(skb, RTA_PREFSRC, 16, &saddr_buf))
2481                         goto nla_put_failure;
2482         }
2483
2484         if (rt->rt6i_prefsrc.plen) {
2485                 struct in6_addr saddr_buf;
2486                 saddr_buf = rt->rt6i_prefsrc.addr;
2487                 if (nla_put(skb, RTA_PREFSRC, 16, &saddr_buf))
2488                         goto nla_put_failure;
2489         }
2490
2491         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2492                 goto nla_put_failure;
2493
2494         n = rt->n;
2495         if (n) {
2496                 if (nla_put(skb, RTA_GATEWAY, 16, &n->primary_key) < 0)
2497                         goto nla_put_failure;
2498         }
2499
2500         if (rt->dst.dev &&
2501             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2502                 goto nla_put_failure;
2503         if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
2504                 goto nla_put_failure;
2505
2506         expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
2507
2508         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
2509                 goto nla_put_failure;
2510
2511         return nlmsg_end(skb, nlh);
2512
2513 nla_put_failure:
2514         nlmsg_cancel(skb, nlh);
2515         return -EMSGSIZE;
2516 }
2517
2518 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2519 {
2520         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2521         int prefix;
2522
2523         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2524                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2525                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2526         } else
2527                 prefix = 0;
2528
2529         return rt6_fill_node(arg->net,
2530                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2531                      NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
2532                      prefix, 0, NLM_F_MULTI);
2533 }
2534
2535 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2536 {
2537         struct net *net = sock_net(in_skb->sk);
2538         struct nlattr *tb[RTA_MAX+1];
2539         struct rt6_info *rt;
2540         struct sk_buff *skb;
2541         struct rtmsg *rtm;
2542         struct flowi6 fl6;
2543         int err, iif = 0, oif = 0;
2544
2545         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2546         if (err < 0)
2547                 goto errout;
2548
2549         err = -EINVAL;
2550         memset(&fl6, 0, sizeof(fl6));
2551
2552         if (tb[RTA_SRC]) {
2553                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2554                         goto errout;
2555
2556                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
2557         }
2558
2559         if (tb[RTA_DST]) {
2560                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2561                         goto errout;
2562
2563                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
2564         }
2565
2566         if (tb[RTA_IIF])
2567                 iif = nla_get_u32(tb[RTA_IIF]);
2568
2569         if (tb[RTA_OIF])
2570                 oif = nla_get_u32(tb[RTA_OIF]);
2571
2572         if (iif) {
2573                 struct net_device *dev;
2574                 int flags = 0;
2575
2576                 dev = __dev_get_by_index(net, iif);
2577                 if (!dev) {
2578                         err = -ENODEV;
2579                         goto errout;
2580                 }
2581
2582                 fl6.flowi6_iif = iif;
2583
2584                 if (!ipv6_addr_any(&fl6.saddr))
2585                         flags |= RT6_LOOKUP_F_HAS_SADDR;
2586
2587                 rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
2588                                                                flags);
2589         } else {
2590                 fl6.flowi6_oif = oif;
2591
2592                 rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
2593         }
2594
2595         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2596         if (!skb) {
2597                 dst_release(&rt->dst);
2598                 err = -ENOBUFS;
2599                 goto errout;
2600         }
2601
2602         /* Reserve room for dummy headers, this skb can pass
2603            through good chunk of routing engine.
2604          */
2605         skb_reset_mac_header(skb);
2606         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2607
2608         skb_dst_set(skb, &rt->dst);
2609
2610         err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2611                             RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
2612                             nlh->nlmsg_seq, 0, 0, 0);
2613         if (err < 0) {
2614                 kfree_skb(skb);
2615                 goto errout;
2616         }
2617
2618         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2619 errout:
2620         return err;
2621 }
2622
2623 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2624 {
2625         struct sk_buff *skb;
2626         struct net *net = info->nl_net;
2627         u32 seq;
2628         int err;
2629
2630         err = -ENOBUFS;
2631         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2632
2633         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2634         if (!skb)
2635                 goto errout;
2636
2637         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2638                                 event, info->portid, seq, 0, 0, 0);
2639         if (err < 0) {
2640                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2641                 WARN_ON(err == -EMSGSIZE);
2642                 kfree_skb(skb);
2643                 goto errout;
2644         }
2645         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
2646                     info->nlh, gfp_any());
2647         return;
2648 errout:
2649         if (err < 0)
2650                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2651 }
2652
2653 static int ip6_route_dev_notify(struct notifier_block *this,
2654                                 unsigned long event, void *data)
2655 {
2656         struct net_device *dev = (struct net_device *)data;
2657         struct net *net = dev_net(dev);
2658
2659         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2660                 net->ipv6.ip6_null_entry->dst.dev = dev;
2661                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2662 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2663                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2664                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2665                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2666                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2667 #endif
2668         }
2669
2670         return NOTIFY_OK;
2671 }
2672
2673 /*
2674  *      /proc
2675  */
2676
2677 #ifdef CONFIG_PROC_FS
2678
/*
 * Buffer bookkeeping for /proc output.
 * NOTE(review): nothing in this part of the file references this struct
 * (the handlers below use seq_file) -- confirm whether it is still needed.
 */
struct rt6_proc_arg {
	char *buffer;
	int offset;
	int length;
	int skip;
	int len;
};
2687
2688 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2689 {
2690         struct seq_file *m = p_arg;
2691         struct neighbour *n;
2692
2693         seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2694
2695 #ifdef CONFIG_IPV6_SUBTREES
2696         seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2697 #else
2698         seq_puts(m, "00000000000000000000000000000000 00 ");
2699 #endif
2700         n = rt->n;
2701         if (n) {
2702                 seq_printf(m, "%pi6", n->primary_key);
2703         } else {
2704                 seq_puts(m, "00000000000000000000000000000000");
2705         }
2706         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2707                    rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2708                    rt->dst.__use, rt->rt6i_flags,
2709                    rt->dst.dev ? rt->dst.dev->name : "");
2710         return 0;
2711 }
2712
2713 static int ipv6_route_show(struct seq_file *m, void *v)
2714 {
2715         struct net *net = (struct net *)m->private;
2716         fib6_clean_all_ro(net, rt6_info_route, 0, m);
2717         return 0;
2718 }
2719
2720 static int ipv6_route_open(struct inode *inode, struct file *file)
2721 {
2722         return single_open_net(inode, file, ipv6_route_show);
2723 }
2724
2725 static const struct file_operations ipv6_route_proc_fops = {
2726         .owner          = THIS_MODULE,
2727         .open           = ipv6_route_open,
2728         .read           = seq_read,
2729         .llseek         = seq_lseek,
2730         .release        = single_release_net,
2731 };
2732
2733 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2734 {
2735         struct net *net = (struct net *)seq->private;
2736         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2737                    net->ipv6.rt6_stats->fib_nodes,
2738                    net->ipv6.rt6_stats->fib_route_nodes,
2739                    net->ipv6.rt6_stats->fib_rt_alloc,
2740                    net->ipv6.rt6_stats->fib_rt_entries,
2741                    net->ipv6.rt6_stats->fib_rt_cache,
2742                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2743                    net->ipv6.rt6_stats->fib_discarded_routes);
2744
2745         return 0;
2746 }
2747
2748 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2749 {
2750         return single_open_net(inode, file, rt6_stats_seq_show);
2751 }
2752
2753 static const struct file_operations rt6_stats_seq_fops = {
2754         .owner   = THIS_MODULE,
2755         .open    = rt6_stats_seq_open,
2756         .read    = seq_read,
2757         .llseek  = seq_lseek,
2758         .release = single_release_net,
2759 };
2760 #endif  /* CONFIG_PROC_FS */
2761
2762 #ifdef CONFIG_SYSCTL
2763
2764 static
2765 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2766                               void __user *buffer, size_t *lenp, loff_t *ppos)
2767 {
2768         struct net *net;
2769         int delay;
2770         if (!write)
2771                 return -EINVAL;
2772
2773         net = (struct net *)ctl->extra1;
2774         delay = net->ipv6.sysctl.flush_delay;
2775         proc_dointvec(ctl, write, buffer, lenp, ppos);
2776         fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2777         return 0;
2778 }
2779
2780 ctl_table ipv6_route_table_template[] = {
2781         {
2782                 .procname       =       "flush",
2783                 .data           =       &init_net.ipv6.sysctl.flush_delay,
2784                 .maxlen         =       sizeof(int),
2785                 .mode           =       0200,
2786                 .proc_handler   =       ipv6_sysctl_rtcache_flush
2787         },
2788         {
2789                 .procname       =       "gc_thresh",
2790                 .data           =       &ip6_dst_ops_template.gc_thresh,
2791                 .maxlen         =       sizeof(int),
2792                 .mode           =       0644,
2793                 .proc_handler   =       proc_dointvec,
2794         },
2795         {
2796                 .procname       =       "max_size",
2797                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
2798                 .maxlen         =       sizeof(int),
2799                 .mode           =       0644,
2800                 .proc_handler   =       proc_dointvec,
2801         },
2802         {
2803                 .procname       =       "gc_min_interval",
2804                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2805                 .maxlen         =       sizeof(int),
2806                 .mode           =       0644,
2807                 .proc_handler   =       proc_dointvec_jiffies,
2808         },
2809         {
2810                 .procname       =       "gc_timeout",
2811                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2812                 .maxlen         =       sizeof(int),
2813                 .mode           =       0644,
2814                 .proc_handler   =       proc_dointvec_jiffies,
2815         },
2816         {
2817                 .procname       =       "gc_interval",
2818                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2819                 .maxlen         =       sizeof(int),
2820                 .mode           =       0644,
2821                 .proc_handler   =       proc_dointvec_jiffies,
2822         },
2823         {
2824                 .procname       =       "gc_elasticity",
2825                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2826                 .maxlen         =       sizeof(int),
2827                 .mode           =       0644,
2828                 .proc_handler   =       proc_dointvec,
2829         },
2830         {
2831                 .procname       =       "mtu_expires",
2832                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2833                 .maxlen         =       sizeof(int),
2834                 .mode           =       0644,
2835                 .proc_handler   =       proc_dointvec_jiffies,
2836         },
2837         {
2838                 .procname       =       "min_adv_mss",
2839                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2840                 .maxlen         =       sizeof(int),
2841                 .mode           =       0644,
2842                 .proc_handler   =       proc_dointvec,
2843         },
2844         {
2845                 .procname       =       "gc_min_interval_ms",
2846                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2847                 .maxlen         =       sizeof(int),
2848                 .mode           =       0644,
2849                 .proc_handler   =       proc_dointvec_ms_jiffies,
2850         },
2851         { }
2852 };
2853
2854 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2855 {
2856         struct ctl_table *table;
2857
2858         table = kmemdup(ipv6_route_table_template,
2859                         sizeof(ipv6_route_table_template),
2860                         GFP_KERNEL);
2861
2862         if (table) {
2863                 table[0].data = &net->ipv6.sysctl.flush_delay;
2864                 table[0].extra1 = net;
2865                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2866                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2867                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2868                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2869                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2870                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2871                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2872                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2873                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2874         }
2875
2876         return table;
2877 }
2878 #endif
2879
2880 static int __net_init ip6_route_net_init(struct net *net)
2881 {
2882         int ret = -ENOMEM;
2883
2884         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2885                sizeof(net->ipv6.ip6_dst_ops));
2886
2887         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2888                 goto out_ip6_dst_ops;
2889
2890         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2891                                            sizeof(*net->ipv6.ip6_null_entry),
2892                                            GFP_KERNEL);
2893         if (!net->ipv6.ip6_null_entry)
2894                 goto out_ip6_dst_entries;
2895         net->ipv6.ip6_null_entry->dst.path =
2896                 (struct dst_entry *)net->ipv6.ip6_null_entry;
2897         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2898         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2899                          ip6_template_metrics, true);
2900
2901 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2902         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2903                                                sizeof(*net->ipv6.ip6_prohibit_entry),
2904                                                GFP_KERNEL);
2905         if (!net->ipv6.ip6_prohibit_entry)
2906                 goto out_ip6_null_entry;
2907         net->ipv6.ip6_prohibit_entry->dst.path =
2908                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2909         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2910         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2911                          ip6_template_metrics, true);
2912
2913         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2914                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
2915                                                GFP_KERNEL);
2916         if (!net->ipv6.ip6_blk_hole_entry)
2917                 goto out_ip6_prohibit_entry;
2918         net->ipv6.ip6_blk_hole_entry->dst.path =
2919                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2920         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2921         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2922                          ip6_template_metrics, true);
2923 #endif
2924
2925         net->ipv6.sysctl.flush_delay = 0;
2926         net->ipv6.sysctl.ip6_rt_max_size = 4096;
2927         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2928         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2929         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2930         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2931         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2932         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2933
2934         net->ipv6.ip6_rt_gc_expire = 30*HZ;
2935
2936         ret = 0;
2937 out:
2938         return ret;
2939
2940 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2941 out_ip6_prohibit_entry:
2942         kfree(net->ipv6.ip6_prohibit_entry);
2943 out_ip6_null_entry:
2944         kfree(net->ipv6.ip6_null_entry);
2945 #endif
2946 out_ip6_dst_entries:
2947         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2948 out_ip6_dst_ops:
2949         goto out;
2950 }
2951
2952 static void __net_exit ip6_route_net_exit(struct net *net)
2953 {
2954         kfree(net->ipv6.ip6_null_entry);
2955 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2956         kfree(net->ipv6.ip6_prohibit_entry);
2957         kfree(net->ipv6.ip6_blk_hole_entry);
2958 #endif
2959         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2960 }
2961
2962 static int __net_init ip6_route_net_init_late(struct net *net)
2963 {
2964 #ifdef CONFIG_PROC_FS
2965         proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2966         proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2967 #endif
2968         return 0;
2969 }
2970
2971 static void __net_exit ip6_route_net_exit_late(struct net *net)
2972 {
2973 #ifdef CONFIG_PROC_FS
2974         proc_net_remove(net, "ipv6_route");
2975         proc_net_remove(net, "rt6_stats");
2976 #endif
2977 }
2978
2979 static struct pernet_operations ip6_route_net_ops = {
2980         .init = ip6_route_net_init,
2981         .exit = ip6_route_net_exit,
2982 };
2983
2984 static int __net_init ipv6_inetpeer_init(struct net *net)
2985 {
2986         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2987
2988         if (!bp)
2989                 return -ENOMEM;
2990         inet_peer_base_init(bp);
2991         net->ipv6.peers = bp;
2992         return 0;
2993 }
2994
2995 static void __net_exit ipv6_inetpeer_exit(struct net *net)
2996 {
2997         struct inet_peer_base *bp = net->ipv6.peers;
2998
2999         net->ipv6.peers = NULL;
3000         inetpeer_invalidate_tree(bp);
3001         kfree(bp);
3002 }
3003
3004 static struct pernet_operations ipv6_inetpeer_ops = {
3005         .init   =       ipv6_inetpeer_init,
3006         .exit   =       ipv6_inetpeer_exit,
3007 };
3008
3009 static struct pernet_operations ip6_route_net_late_ops = {
3010         .init = ip6_route_net_init_late,
3011         .exit = ip6_route_net_exit_late,
3012 };
3013
3014 static struct notifier_block ip6_route_dev_notifier = {
3015         .notifier_call = ip6_route_dev_notify,
3016         .priority = 0,
3017 };
3018
3019 int __init ip6_route_init(void)
3020 {
3021         int ret;
3022
3023         ret = -ENOMEM;
3024         ip6_dst_ops_template.kmem_cachep =
3025                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
3026                                   SLAB_HWCACHE_ALIGN, NULL);
3027         if (!ip6_dst_ops_template.kmem_cachep)
3028                 goto out;
3029
3030         ret = dst_entries_init(&ip6_dst_blackhole_ops);
3031         if (ret)
3032                 goto out_kmem_cache;
3033
3034         ret = register_pernet_subsys(&ipv6_inetpeer_ops);
3035         if (ret)
3036                 goto out_dst_entries;
3037
3038         ret = register_pernet_subsys(&ip6_route_net_ops);
3039         if (ret)
3040                 goto out_register_inetpeer;
3041
3042         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
3043
3044         /* Registering of the loopback is done before this portion of code,
3045          * the loopback reference in rt6_info will not be taken, do it
3046          * manually for init_net */
3047         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
3048         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3049   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3050         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
3051         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3052         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
3053         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3054   #endif
3055         ret = fib6_init();
3056         if (ret)
3057                 goto out_register_subsys;
3058
3059         ret = xfrm6_init();
3060         if (ret)
3061                 goto out_fib6_init;
3062
3063         ret = fib6_rules_init();
3064         if (ret)
3065                 goto xfrm6_init;
3066
3067         ret = register_pernet_subsys(&ip6_route_net_late_ops);
3068         if (ret)
3069                 goto fib6_rules_init;
3070
3071         ret = -ENOBUFS;
3072         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
3073             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
3074             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
3075                 goto out_register_late_subsys;
3076
3077         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3078         if (ret)
3079                 goto out_register_late_subsys;
3080
3081 out:
3082         return ret;
3083
3084 out_register_late_subsys:
3085         unregister_pernet_subsys(&ip6_route_net_late_ops);
3086 fib6_rules_init:
3087         fib6_rules_cleanup();
3088 xfrm6_init:
3089         xfrm6_fini();
3090 out_fib6_init:
3091         fib6_gc_cleanup();
3092 out_register_subsys:
3093         unregister_pernet_subsys(&ip6_route_net_ops);
3094 out_register_inetpeer:
3095         unregister_pernet_subsys(&ipv6_inetpeer_ops);
3096 out_dst_entries:
3097         dst_entries_destroy(&ip6_dst_blackhole_ops);
3098 out_kmem_cache:
3099         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3100         goto out;
3101 }
3102
3103 void ip6_route_cleanup(void)
3104 {
3105         unregister_netdevice_notifier(&ip6_route_dev_notifier);
3106         unregister_pernet_subsys(&ip6_route_net_late_ops);
3107         fib6_rules_cleanup();
3108         xfrm6_fini();
3109         fib6_gc_cleanup();
3110         unregister_pernet_subsys(&ipv6_inetpeer_ops);
3111         unregister_pernet_subsys(&ip6_route_net_ops);
3112         dst_entries_destroy(&ip6_dst_blackhole_ops);
3113         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3114 }