net/ipv6/route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <net/net_namespace.h>
48 #include <net/snmp.h>
49 #include <net/ipv6.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #include <net/ndisc.h>
53 #include <net/addrconf.h>
54 #include <net/tcp.h>
55 #include <linux/rtnetlink.h>
56 #include <net/dst.h>
57 #include <net/xfrm.h>
58 #include <net/netevent.h>
59 #include <net/netlink.h>
60 #include <net/nexthop.h>
61
62 #include <asm/uaccess.h>
63
64 #ifdef CONFIG_SYSCTL
65 #include <linux/sysctl.h>
66 #endif
67
68 enum rt6_nud_state {
69         RT6_NUD_FAIL_HARD = -3,
70         RT6_NUD_FAIL_PROBE = -2,
71         RT6_NUD_FAIL_DO_RR = -1,
72         RT6_NUD_SUCCEED = 1
73 };
74
75 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
76 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
77 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
78 static unsigned int      ip6_mtu(const struct dst_entry *dst);
79 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
80 static void             ip6_dst_destroy(struct dst_entry *);
81 static void             ip6_dst_ifdown(struct dst_entry *,
82                                        struct net_device *dev, int how);
83 static int               ip6_dst_gc(struct dst_ops *ops);
84
85 static int              ip6_pkt_discard(struct sk_buff *skb);
86 static int              ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb);
87 static int              ip6_pkt_prohibit(struct sk_buff *skb);
88 static int              ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb);
89 static void             ip6_link_failure(struct sk_buff *skb);
90 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
91                                            struct sk_buff *skb, u32 mtu);
92 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
93                                         struct sk_buff *skb);
94 static void             rt6_dst_from_metrics_check(struct rt6_info *rt);
95 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
96
97 #ifdef CONFIG_IPV6_ROUTE_INFO
98 static struct rt6_info *rt6_add_route_info(struct net *net,
99                                            const struct in6_addr *prefix, int prefixlen,
100                                            const struct in6_addr *gwaddr, int ifindex,
101                                            unsigned int pref);
102 static struct rt6_info *rt6_get_route_info(struct net *net,
103                                            const struct in6_addr *prefix, int prefixlen,
104                                            const struct in6_addr *gwaddr, int ifindex);
105 #endif
106
107 struct uncached_list {
108         spinlock_t              lock;
109         struct list_head        head;
110 };
111
112 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
113
114 static void rt6_uncached_list_add(struct rt6_info *rt)
115 {
116         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
117
118         rt->dst.flags |= DST_NOCACHE;
119         rt->rt6i_uncached_list = ul;
120
121         spin_lock_bh(&ul->lock);
122         list_add_tail(&rt->rt6i_uncached, &ul->head);
123         spin_unlock_bh(&ul->lock);
124 }
125
126 static void rt6_uncached_list_del(struct rt6_info *rt)
127 {
128         if (!list_empty(&rt->rt6i_uncached)) {
129                 struct uncached_list *ul = rt->rt6i_uncached_list;
130
131                 spin_lock_bh(&ul->lock);
132                 list_del(&rt->rt6i_uncached);
133                 spin_unlock_bh(&ul->lock);
134         }
135 }
136
137 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
138 {
139         struct net_device *loopback_dev = net->loopback_dev;
140         int cpu;
141
142         for_each_possible_cpu(cpu) {
143                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
144                 struct rt6_info *rt;
145
146                 spin_lock_bh(&ul->lock);
147                 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
148                         struct inet6_dev *rt_idev = rt->rt6i_idev;
149                         struct net_device *rt_dev = rt->dst.dev;
150
151                         if (rt_idev && (rt_idev->dev == dev || !dev) &&
152                             rt_idev->dev != loopback_dev) {
153                                 rt->rt6i_idev = in6_dev_get(loopback_dev);
154                                 in6_dev_put(rt_idev);
155                         }
156
157                         if (rt_dev && (rt_dev == dev || !dev) &&
158                             rt_dev != loopback_dev) {
159                                 rt->dst.dev = loopback_dev;
160                                 dev_hold(rt->dst.dev);
161                                 dev_put(rt_dev);
162                         }
163                 }
164                 spin_unlock_bh(&ul->lock);
165         }
166 }
167
168 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
169 {
170         return dst_metrics_write_ptr(rt->dst.from);
171 }
172
173 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
174 {
175         struct rt6_info *rt = (struct rt6_info *)dst;
176
177         if (rt->rt6i_flags & RTF_PCPU)
178                 return rt6_pcpu_cow_metrics(rt);
179         else if (rt->rt6i_flags & RTF_CACHE)
180                 return NULL;
181         else
182                 return dst_cow_metrics_generic(dst, old);
183 }
184
185 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
186                                              struct sk_buff *skb,
187                                              const void *daddr)
188 {
189         struct in6_addr *p = &rt->rt6i_gateway;
190
191         if (!ipv6_addr_any(p))
192                 return (const void *) p;
193         else if (skb)
194                 return &ipv6_hdr(skb)->daddr;
195         return daddr;
196 }
197
198 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
199                                           struct sk_buff *skb,
200                                           const void *daddr)
201 {
202         struct rt6_info *rt = (struct rt6_info *) dst;
203         struct neighbour *n;
204
205         daddr = choose_neigh_daddr(rt, skb, daddr);
206         n = __ipv6_neigh_lookup(dst->dev, daddr);
207         if (n)
208                 return n;
209         return neigh_create(&nd_tbl, daddr, dst->dev);
210 }
211
212 static struct dst_ops ip6_dst_ops_template = {
213         .family                 =       AF_INET6,
214         .gc                     =       ip6_dst_gc,
215         .gc_thresh              =       1024,
216         .check                  =       ip6_dst_check,
217         .default_advmss         =       ip6_default_advmss,
218         .mtu                    =       ip6_mtu,
219         .cow_metrics            =       ipv6_cow_metrics,
220         .destroy                =       ip6_dst_destroy,
221         .ifdown                 =       ip6_dst_ifdown,
222         .negative_advice        =       ip6_negative_advice,
223         .link_failure           =       ip6_link_failure,
224         .update_pmtu            =       ip6_rt_update_pmtu,
225         .redirect               =       rt6_do_redirect,
226         .local_out              =       __ip6_local_out,
227         .neigh_lookup           =       ip6_neigh_lookup,
228 };
229
230 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
231 {
232         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
233
234         return mtu ? : dst->dev->mtu;
235 }
236
237 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
238                                          struct sk_buff *skb, u32 mtu)
239 {
240 }
241
242 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
243                                       struct sk_buff *skb)
244 {
245 }
246
247 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
248                                          unsigned long old)
249 {
250         return NULL;
251 }
252
253 static struct dst_ops ip6_dst_blackhole_ops = {
254         .family                 =       AF_INET6,
255         .destroy                =       ip6_dst_destroy,
256         .check                  =       ip6_dst_check,
257         .mtu                    =       ip6_blackhole_mtu,
258         .default_advmss         =       ip6_default_advmss,
259         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
260         .redirect               =       ip6_rt_blackhole_redirect,
261         .cow_metrics            =       ip6_rt_blackhole_cow_metrics,
262         .neigh_lookup           =       ip6_neigh_lookup,
263 };
264
265 static const u32 ip6_template_metrics[RTAX_MAX] = {
266         [RTAX_HOPLIMIT - 1] = 0,
267 };
268
269 static const struct rt6_info ip6_null_entry_template = {
270         .dst = {
271                 .__refcnt       = ATOMIC_INIT(1),
272                 .__use          = 1,
273                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
274                 .error          = -ENETUNREACH,
275                 .input          = ip6_pkt_discard,
276                 .output         = ip6_pkt_discard_out,
277         },
278         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
279         .rt6i_protocol  = RTPROT_KERNEL,
280         .rt6i_metric    = ~(u32) 0,
281         .rt6i_ref       = ATOMIC_INIT(1),
282 };
283
284 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
285
286 static const struct rt6_info ip6_prohibit_entry_template = {
287         .dst = {
288                 .__refcnt       = ATOMIC_INIT(1),
289                 .__use          = 1,
290                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
291                 .error          = -EACCES,
292                 .input          = ip6_pkt_prohibit,
293                 .output         = ip6_pkt_prohibit_out,
294         },
295         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
296         .rt6i_protocol  = RTPROT_KERNEL,
297         .rt6i_metric    = ~(u32) 0,
298         .rt6i_ref       = ATOMIC_INIT(1),
299 };
300
301 static const struct rt6_info ip6_blk_hole_entry_template = {
302         .dst = {
303                 .__refcnt       = ATOMIC_INIT(1),
304                 .__use          = 1,
305                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
306                 .error          = -EINVAL,
307                 .input          = dst_discard,
308                 .output         = dst_discard_sk,
309         },
310         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
311         .rt6i_protocol  = RTPROT_KERNEL,
312         .rt6i_metric    = ~(u32) 0,
313         .rt6i_ref       = ATOMIC_INIT(1),
314 };
315
316 #endif
317
318 /* allocate dst with ip6_dst_ops */
319 static struct rt6_info *__ip6_dst_alloc(struct net *net,
320                                         struct net_device *dev,
321                                         int flags,
322                                         struct fib6_table *table)
323 {
324         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
325                                         0, DST_OBSOLETE_FORCE_CHK, flags);
326
327         if (rt) {
328                 struct dst_entry *dst = &rt->dst;
329
330                 memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
331                 INIT_LIST_HEAD(&rt->rt6i_siblings);
332                 INIT_LIST_HEAD(&rt->rt6i_uncached);
333         }
334         return rt;
335 }
336
337 static struct rt6_info *ip6_dst_alloc(struct net *net,
338                                       struct net_device *dev,
339                                       int flags,
340                                       struct fib6_table *table)
341 {
342         struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags, table);
343
344         if (rt) {
345                 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
346                 if (rt->rt6i_pcpu) {
347                         int cpu;
348
349                         for_each_possible_cpu(cpu) {
350                                 struct rt6_info **p;
351
352                                 p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
353                                 /* no one shares rt */
354                                 *p =  NULL;
355                         }
356                 } else {
357                         dst_destroy((struct dst_entry *)rt);
358                         return NULL;
359                 }
360         }
361
362         return rt;
363 }
364
365 static void ip6_dst_destroy(struct dst_entry *dst)
366 {
367         struct rt6_info *rt = (struct rt6_info *)dst;
368         struct dst_entry *from = dst->from;
369         struct inet6_dev *idev;
370
371         dst_destroy_metrics_generic(dst);
372         free_percpu(rt->rt6i_pcpu);
373         rt6_uncached_list_del(rt);
374
375         idev = rt->rt6i_idev;
376         if (idev) {
377                 rt->rt6i_idev = NULL;
378                 in6_dev_put(idev);
379         }
380
381         dst->from = NULL;
382         dst_release(from);
383 }
384
385 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
386                            int how)
387 {
388         struct rt6_info *rt = (struct rt6_info *)dst;
389         struct inet6_dev *idev = rt->rt6i_idev;
390         struct net_device *loopback_dev =
391                 dev_net(dev)->loopback_dev;
392
393         if (dev != loopback_dev) {
394                 if (idev && idev->dev == dev) {
395                         struct inet6_dev *loopback_idev =
396                                 in6_dev_get(loopback_dev);
397                         if (loopback_idev) {
398                                 rt->rt6i_idev = loopback_idev;
399                                 in6_dev_put(idev);
400                         }
401                 }
402         }
403 }
404
405 static bool rt6_check_expired(const struct rt6_info *rt)
406 {
407         if (rt->rt6i_flags & RTF_EXPIRES) {
408                 if (time_after(jiffies, rt->dst.expires))
409                         return true;
410         } else if (rt->dst.from) {
411                 return rt6_check_expired((struct rt6_info *) rt->dst.from);
412         }
413         return false;
414 }
415
416 /* Multipath route selection:
417  *   Hash-based function using the packet header and flow label.
418  * Adapted from fib_info_hashfn().
419  */
420 static int rt6_info_hash_nhsfn(unsigned int candidate_count,
421                                const struct flowi6 *fl6)
422 {
423         unsigned int val = fl6->flowi6_proto;
424
425         val ^= ipv6_addr_hash(&fl6->daddr);
426         val ^= ipv6_addr_hash(&fl6->saddr);
427
428         /* Works only if this is not encapsulated */
429         switch (fl6->flowi6_proto) {
430         case IPPROTO_UDP:
431         case IPPROTO_TCP:
432         case IPPROTO_SCTP:
433                 val ^= (__force u16)fl6->fl6_sport;
434                 val ^= (__force u16)fl6->fl6_dport;
435                 break;
436
437         case IPPROTO_ICMPV6:
438                 val ^= (__force u16)fl6->fl6_icmp_type;
439                 val ^= (__force u16)fl6->fl6_icmp_code;
440                 break;
441         }
442         /* RFC6438 recommends using the flow label */
443         val ^= (__force u32)fl6->flowlabel;
444
445         /* Perhaps we need to tune this function? */
446         val = val ^ (val >> 7) ^ (val >> 12);
447         return val % candidate_count;
448 }
449
450 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
451                                              struct flowi6 *fl6, int oif,
452                                              int strict)
453 {
454         struct rt6_info *sibling, *next_sibling;
455         int route_chosen;
456
457         route_chosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
458         /* Don't change the route if route_chosen == 0
459          * (the siblings list does not include ourselves)
460          */
461         if (route_chosen)
462                 list_for_each_entry_safe(sibling, next_sibling,
463                                 &match->rt6i_siblings, rt6i_siblings) {
464                         route_chosen--;
465                         if (route_chosen == 0) {
466                                 if (rt6_score_route(sibling, oif, strict) < 0)
467                                         break;
468                                 match = sibling;
469                                 break;
470                         }
471                 }
472         return match;
473 }
474
475 /*
476  *      Route lookup. Any table->tb6_lock is implied.
477  */
478
479 static inline struct rt6_info *rt6_device_match(struct net *net,
480                                                     struct rt6_info *rt,
481                                                     const struct in6_addr *saddr,
482                                                     int oif,
483                                                     int flags)
484 {
485         struct rt6_info *local = NULL;
486         struct rt6_info *sprt;
487
488         if (!oif && ipv6_addr_any(saddr))
489                 goto out;
490
491         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
492                 struct net_device *dev = sprt->dst.dev;
493
494                 if (oif) {
495                         if (dev->ifindex == oif)
496                                 return sprt;
497                         if (dev->flags & IFF_LOOPBACK) {
498                                 if (!sprt->rt6i_idev ||
499                                     sprt->rt6i_idev->dev->ifindex != oif) {
500                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
501                                                 continue;
502                                         if (local && (!oif ||
503                                                       local->rt6i_idev->dev->ifindex == oif))
504                                                 continue;
505                                 }
506                                 local = sprt;
507                         }
508                 } else {
509                         if (ipv6_chk_addr(net, saddr, dev,
510                                           flags & RT6_LOOKUP_F_IFACE))
511                                 return sprt;
512                 }
513         }
514
515         if (oif) {
516                 if (local)
517                         return local;
518
519                 if (flags & RT6_LOOKUP_F_IFACE)
520                         return net->ipv6.ip6_null_entry;
521         }
522 out:
523         return rt;
524 }
525
526 #ifdef CONFIG_IPV6_ROUTER_PREF
527 struct __rt6_probe_work {
528         struct work_struct work;
529         struct in6_addr target;
530         struct net_device *dev;
531 };
532
533 static void rt6_probe_deferred(struct work_struct *w)
534 {
535         struct in6_addr mcaddr;
536         struct __rt6_probe_work *work =
537                 container_of(w, struct __rt6_probe_work, work);
538
539         addrconf_addr_solict_mult(&work->target, &mcaddr);
540         ndisc_send_ns(work->dev, NULL, &work->target, &mcaddr, NULL);
541         dev_put(work->dev);
542         kfree(work);
543 }
544
545 static void rt6_probe(struct rt6_info *rt)
546 {
547         struct neighbour *neigh;
548         /*
549          * Okay, this does not seem to be appropriate
550          * for now; however, we need to check whether it
551          * really is so, aka Router Reachability Probing.
552          *
553          * Router Reachability Probe MUST be rate-limited
554          * to no more than one per minute.
555          */
556         if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
557                 return;
558         rcu_read_lock_bh();
559         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
560         if (neigh) {
561                 write_lock(&neigh->lock);
562                 if (neigh->nud_state & NUD_VALID)
563                         goto out;
564         }
565
566         if (!neigh ||
567             time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
568                 struct __rt6_probe_work *work;
569
570                 work = kmalloc(sizeof(*work), GFP_ATOMIC);
571
572                 if (neigh && work)
573                         __neigh_set_probe_once(neigh);
574
575                 if (neigh)
576                         write_unlock(&neigh->lock);
577
578                 if (work) {
579                         INIT_WORK(&work->work, rt6_probe_deferred);
580                         work->target = rt->rt6i_gateway;
581                         dev_hold(rt->dst.dev);
582                         work->dev = rt->dst.dev;
583                         schedule_work(&work->work);
584                 }
585         } else {
586 out:
587                 write_unlock(&neigh->lock);
588         }
589         rcu_read_unlock_bh();
590 }
591 #else
592 static inline void rt6_probe(struct rt6_info *rt)
593 {
594 }
595 #endif
596
597 /*
598  * Default Router Selection (RFC 2461 6.3.6)
599  */
600 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
601 {
602         struct net_device *dev = rt->dst.dev;
603         if (!oif || dev->ifindex == oif)
604                 return 2;
605         if ((dev->flags & IFF_LOOPBACK) &&
606             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
607                 return 1;
608         return 0;
609 }
610
611 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
612 {
613         struct neighbour *neigh;
614         enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
615
616         if (rt->rt6i_flags & RTF_NONEXTHOP ||
617             !(rt->rt6i_flags & RTF_GATEWAY))
618                 return RT6_NUD_SUCCEED;
619
620         rcu_read_lock_bh();
621         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
622         if (neigh) {
623                 read_lock(&neigh->lock);
624                 if (neigh->nud_state & NUD_VALID)
625                         ret = RT6_NUD_SUCCEED;
626 #ifdef CONFIG_IPV6_ROUTER_PREF
627                 else if (!(neigh->nud_state & NUD_FAILED))
628                         ret = RT6_NUD_SUCCEED;
629                 else
630                         ret = RT6_NUD_FAIL_PROBE;
631 #endif
632                 read_unlock(&neigh->lock);
633         } else {
634                 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
635                       RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
636         }
637         rcu_read_unlock_bh();
638
639         return ret;
640 }
641
642 static int rt6_score_route(struct rt6_info *rt, int oif,
643                            int strict)
644 {
645         int m;
646
647         m = rt6_check_dev(rt, oif);
648         if (!m && (strict & RT6_LOOKUP_F_IFACE))
649                 return RT6_NUD_FAIL_HARD;
650 #ifdef CONFIG_IPV6_ROUTER_PREF
651         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
652 #endif
653         if (strict & RT6_LOOKUP_F_REACHABLE) {
654                 int n = rt6_check_neigh(rt);
655                 if (n < 0)
656                         return n;
657         }
658         return m;
659 }
660
661 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
662                                    int *mpri, struct rt6_info *match,
663                                    bool *do_rr)
664 {
665         int m;
666         bool match_do_rr = false;
667
668         if (rt6_check_expired(rt))
669                 goto out;
670
671         m = rt6_score_route(rt, oif, strict);
672         if (m == RT6_NUD_FAIL_DO_RR) {
673                 match_do_rr = true;
674                 m = 0; /* lowest valid score */
675         } else if (m == RT6_NUD_FAIL_HARD) {
676                 goto out;
677         }
678
679         if (strict & RT6_LOOKUP_F_REACHABLE)
680                 rt6_probe(rt);
681
682         /* note that m can be RT6_NUD_FAIL_PROBE at this point */
683         if (m > *mpri) {
684                 *do_rr = match_do_rr;
685                 *mpri = m;
686                 match = rt;
687         }
688 out:
689         return match;
690 }
691
692 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
693                                      struct rt6_info *rr_head,
694                                      u32 metric, int oif, int strict,
695                                      bool *do_rr)
696 {
697         struct rt6_info *rt, *match, *cont;
698         int mpri = -1;
699
700         match = NULL;
701         cont = NULL;
702         for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
703                 if (rt->rt6i_metric != metric) {
704                         cont = rt;
705                         break;
706                 }
707
708                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
709         }
710
711         for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
712                 if (rt->rt6i_metric != metric) {
713                         cont = rt;
714                         break;
715                 }
716
717                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
718         }
719
720         if (match || !cont)
721                 return match;
722
723         for (rt = cont; rt; rt = rt->dst.rt6_next)
724                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
725
726         return match;
727 }
728
729 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
730 {
731         struct rt6_info *match, *rt0;
732         struct net *net;
733         bool do_rr = false;
734
735         rt0 = fn->rr_ptr;
736         if (!rt0)
737                 fn->rr_ptr = rt0 = fn->leaf;
738
739         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict,
740                              &do_rr);
741
742         if (do_rr) {
743                 struct rt6_info *next = rt0->dst.rt6_next;
744
745                 /* no entries matched; do round-robin */
746                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
747                         next = fn->leaf;
748
749                 if (next != rt0)
750                         fn->rr_ptr = next;
751         }
752
753         net = dev_net(rt0->dst.dev);
754         return match ? match : net->ipv6.ip6_null_entry;
755 }
756
757 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
758 {
759         return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
760 }
761
762 #ifdef CONFIG_IPV6_ROUTE_INFO
763 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
764                   const struct in6_addr *gwaddr)
765 {
766         struct net *net = dev_net(dev);
767         struct route_info *rinfo = (struct route_info *) opt;
768         struct in6_addr prefix_buf, *prefix;
769         unsigned int pref;
770         unsigned long lifetime;
771         struct rt6_info *rt;
772
773         if (len < sizeof(struct route_info)) {
774                 return -EINVAL;
775         }
776
777         /* Sanity check for prefix_len and length */
778         if (rinfo->length > 3) {
779                 return -EINVAL;
780         } else if (rinfo->prefix_len > 128) {
781                 return -EINVAL;
782         } else if (rinfo->prefix_len > 64) {
783                 if (rinfo->length < 2) {
784                         return -EINVAL;
785                 }
786         } else if (rinfo->prefix_len > 0) {
787                 if (rinfo->length < 1) {
788                         return -EINVAL;
789                 }
790         }
791
792         pref = rinfo->route_pref;
793         if (pref == ICMPV6_ROUTER_PREF_INVALID)
794                 return -EINVAL;
795
796         lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
797
798         if (rinfo->length == 3)
799                 prefix = (struct in6_addr *)rinfo->prefix;
800         else {
801                 /* this function is safe */
802                 ipv6_addr_prefix(&prefix_buf,
803                                  (struct in6_addr *)rinfo->prefix,
804                                  rinfo->prefix_len);
805                 prefix = &prefix_buf;
806         }
807
808         if (rinfo->prefix_len == 0)
809                 rt = rt6_get_dflt_router(gwaddr, dev);
810         else
811                 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
812                                         gwaddr, dev->ifindex);
813
814         if (rt && !lifetime) {
815                 ip6_del_rt(rt);
816                 rt = NULL;
817         }
818
819         if (!rt && lifetime)
820                 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
821                                         pref);
822         else if (rt)
823                 rt->rt6i_flags = RTF_ROUTEINFO |
824                                  (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
825
826         if (rt) {
827                 if (!addrconf_finite_timeout(lifetime))
828                         rt6_clean_expires(rt);
829                 else
830                         rt6_set_expires(rt, jiffies + HZ * lifetime);
831
832                 ip6_rt_put(rt);
833         }
834         return 0;
835 }
836 #endif
837
838 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
839                                         struct in6_addr *saddr)
840 {
841         struct fib6_node *pn;
842         while (1) {
843                 if (fn->fn_flags & RTN_TL_ROOT)
844                         return NULL;
845                 pn = fn->parent;
846                 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
847                         fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
848                 else
849                         fn = pn;
850                 if (fn->fn_flags & RTN_RTINFO)
851                         return fn;
852         }
853 }
854
855 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
856                                              struct fib6_table *table,
857                                              struct flowi6 *fl6, int flags)
858 {
859         struct fib6_node *fn;
860         struct rt6_info *rt;
861
862         read_lock_bh(&table->tb6_lock);
863         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
864 restart:
865         rt = fn->leaf;
866         rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
867         if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
868                 rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
869         if (rt == net->ipv6.ip6_null_entry) {
870                 fn = fib6_backtrack(fn, &fl6->saddr);
871                 if (fn)
872                         goto restart;
873         }
874         dst_use(&rt->dst, jiffies);
875         read_unlock_bh(&table->tb6_lock);
876         return rt;
877
878 }
879
880 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
881                                     int flags)
882 {
883         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
884 }
885 EXPORT_SYMBOL_GPL(ip6_route_lookup);
886
887 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
888                             const struct in6_addr *saddr, int oif, int strict)
889 {
890         struct flowi6 fl6 = {
891                 .flowi6_oif = oif,
892                 .daddr = *daddr,
893         };
894         struct dst_entry *dst;
895         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
896
897         if (saddr) {
898                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
899                 flags |= RT6_LOOKUP_F_HAS_SADDR;
900         }
901
902         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
903         if (dst->error == 0)
904                 return (struct rt6_info *) dst;
905
906         dst_release(dst);
907
908         return NULL;
909 }
910 EXPORT_SYMBOL(rt6_lookup);
911
912 /* ip6_ins_rt is called with FREE table->tb6_lock.
913    It takes a new route entry; if the addition fails for any reason,
914    the route is freed. In any case, if the caller does not hold a
915    reference to it, it may be destroyed.
916  */
917
918 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
919                         struct mx6_config *mxc)
920 {
921         int err;
922         struct fib6_table *table;
923
924         table = rt->rt6i_table;
925         write_lock_bh(&table->tb6_lock);
926         err = fib6_add(&table->tb6_root, rt, info, mxc);
927         write_unlock_bh(&table->tb6_lock);
928
929         return err;
930 }
931
932 int ip6_ins_rt(struct rt6_info *rt)
933 {
934         struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
935         struct mx6_config mxc = { .mx = NULL, };
936
937         return __ip6_ins_rt(rt, &info, &mxc);
938 }
939
940 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
941                                            const struct in6_addr *daddr,
942                                            const struct in6_addr *saddr)
943 {
944         struct rt6_info *rt;
945
946         /*
947          *      Clone the route.
948          */
949
950         if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
951                 ort = (struct rt6_info *)ort->dst.from;
952
953         rt = __ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev,
954                              0, ort->rt6i_table);
955
956         if (!rt)
957                 return NULL;
958
959         ip6_rt_copy_init(rt, ort);
960         rt->rt6i_flags |= RTF_CACHE;
961         rt->rt6i_metric = 0;
962         rt->dst.flags |= DST_HOST;
963         rt->rt6i_dst.addr = *daddr;
964         rt->rt6i_dst.plen = 128;
965
966         if (!rt6_is_gw_or_nonexthop(ort)) {
967                 if (ort->rt6i_dst.plen != 128 &&
968                     ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
969                         rt->rt6i_flags |= RTF_ANYCAST;
970 #ifdef CONFIG_IPV6_SUBTREES
971                 if (rt->rt6i_src.plen && saddr) {
972                         rt->rt6i_src.addr = *saddr;
973                         rt->rt6i_src.plen = 128;
974                 }
975 #endif
976         }
977
978         return rt;
979 }
980
981 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
982 {
983         struct rt6_info *pcpu_rt;
984
985         pcpu_rt = __ip6_dst_alloc(dev_net(rt->dst.dev),
986                                   rt->dst.dev, rt->dst.flags,
987                                   rt->rt6i_table);
988
989         if (!pcpu_rt)
990                 return NULL;
991         ip6_rt_copy_init(pcpu_rt, rt);
992         pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
993         pcpu_rt->rt6i_flags |= RTF_PCPU;
994         return pcpu_rt;
995 }
996
997 /* It should be called with read_lock_bh(&tb6_lock) acquired */
998 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
999 {
1000         struct rt6_info *pcpu_rt, *prev, **p;
1001
1002         p = this_cpu_ptr(rt->rt6i_pcpu);
1003         pcpu_rt = *p;
1004
1005         if (pcpu_rt)
1006                 goto done;
1007
1008         pcpu_rt = ip6_rt_pcpu_alloc(rt);
1009         if (!pcpu_rt) {
1010                 struct net *net = dev_net(rt->dst.dev);
1011
1012                 pcpu_rt = net->ipv6.ip6_null_entry;
1013                 goto done;
1014         }
1015
1016         prev = cmpxchg(p, NULL, pcpu_rt);
1017         if (prev) {
1018                 /* If someone did it before us, return prev instead */
1019                 dst_destroy(&pcpu_rt->dst);
1020                 pcpu_rt = prev;
1021         }
1022
1023 done:
1024         dst_hold(&pcpu_rt->dst);
1025         rt6_dst_from_metrics_check(pcpu_rt);
1026         return pcpu_rt;
1027 }
1028
1029 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
1030                                       struct flowi6 *fl6, int flags)
1031 {
1032         struct fib6_node *fn, *saved_fn;
1033         struct rt6_info *rt;
1034         int strict = 0;
1035
1036         strict |= flags & RT6_LOOKUP_F_IFACE;
1037         if (net->ipv6.devconf_all->forwarding == 0)
1038                 strict |= RT6_LOOKUP_F_REACHABLE;
1039
1040         read_lock_bh(&table->tb6_lock);
1041
1042         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1043         saved_fn = fn;
1044
1045 redo_rt6_select:
1046         rt = rt6_select(fn, oif, strict);
1047         if (rt->rt6i_nsiblings)
1048                 rt = rt6_multipath_select(rt, fl6, oif, strict);
1049         if (rt == net->ipv6.ip6_null_entry) {
1050                 fn = fib6_backtrack(fn, &fl6->saddr);
1051                 if (fn)
1052                         goto redo_rt6_select;
1053                 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1054                         /* also consider unreachable route */
1055                         strict &= ~RT6_LOOKUP_F_REACHABLE;
1056                         fn = saved_fn;
1057                         goto redo_rt6_select;
1058                 }
1059         }
1060
1061
1062         if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
1063                 dst_use(&rt->dst, jiffies);
1064                 read_unlock_bh(&table->tb6_lock);
1065
1066                 rt6_dst_from_metrics_check(rt);
1067                 return rt;
1068         } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1069                             !(rt->rt6i_flags & RTF_GATEWAY))) {
1070                 /* Create an RTF_CACHE clone which will not be
1071                  * owned by the fib6 tree.  It is for the special case where
1072                  * the daddr in the skb during the neighbor look-up is different
1073                  * from the fl6->daddr used to look up the route here.
1074                  */
1075
1076                 struct rt6_info *uncached_rt;
1077
1078                 dst_use(&rt->dst, jiffies);
1079                 read_unlock_bh(&table->tb6_lock);
1080
1081                 uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1082                 dst_release(&rt->dst);
1083
1084                 if (uncached_rt)
1085                         rt6_uncached_list_add(uncached_rt);
1086                 else
1087                         uncached_rt = net->ipv6.ip6_null_entry;
1088
1089                 dst_hold(&uncached_rt->dst);
1090                 return uncached_rt;
1091
1092         } else {
1093                 /* Get a percpu copy */
1094
1095                 struct rt6_info *pcpu_rt;
1096
1097                 rt->dst.lastuse = jiffies;
1098                 rt->dst.__use++;
1099                 pcpu_rt = rt6_get_pcpu_route(rt);
1100                 read_unlock_bh(&table->tb6_lock);
1101
1102                 return pcpu_rt;
1103         }
1104 }
1105
1106 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
1107                                             struct flowi6 *fl6, int flags)
1108 {
1109         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
1110 }
1111
1112 static struct dst_entry *ip6_route_input_lookup(struct net *net,
1113                                                 struct net_device *dev,
1114                                                 struct flowi6 *fl6, int flags)
1115 {
1116         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1117                 flags |= RT6_LOOKUP_F_IFACE;
1118
1119         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1120 }
1121
1122 void ip6_route_input(struct sk_buff *skb)
1123 {
1124         const struct ipv6hdr *iph = ipv6_hdr(skb);
1125         struct net *net = dev_net(skb->dev);
1126         int flags = RT6_LOOKUP_F_HAS_SADDR;
1127         struct flowi6 fl6 = {
1128                 .flowi6_iif = skb->dev->ifindex,
1129                 .daddr = iph->daddr,
1130                 .saddr = iph->saddr,
1131                 .flowlabel = ip6_flowinfo(iph),
1132                 .flowi6_mark = skb->mark,
1133                 .flowi6_proto = iph->nexthdr,
1134         };
1135
1136         skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
1137 }
1138
1139 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1140                                              struct flowi6 *fl6, int flags)
1141 {
1142         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
1143 }
1144
1145 struct dst_entry *ip6_route_output(struct net *net, const struct sock *sk,
1146                                     struct flowi6 *fl6)
1147 {
1148         int flags = 0;
1149
1150         fl6->flowi6_iif = LOOPBACK_IFINDEX;
1151
1152         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
1153                 flags |= RT6_LOOKUP_F_IFACE;
1154
1155         if (!ipv6_addr_any(&fl6->saddr))
1156                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1157         else if (sk)
1158                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1159
1160         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1161 }
1162 EXPORT_SYMBOL(ip6_route_output);
1163
1164 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1165 {
1166         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1167         struct dst_entry *new = NULL;
1168
1169         rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0);
1170         if (rt) {
1171                 new = &rt->dst;
1172
1173                 memset(new + 1, 0, sizeof(*rt) - sizeof(*new));
1174
1175                 new->__use = 1;
1176                 new->input = dst_discard;
1177                 new->output = dst_discard_sk;
1178
1179                 if (dst_metrics_read_only(&ort->dst))
1180                         new->_metrics = ort->dst._metrics;
1181                 else
1182                         dst_copy_metrics(new, &ort->dst);
1183                 rt->rt6i_idev = ort->rt6i_idev;
1184                 if (rt->rt6i_idev)
1185                         in6_dev_hold(rt->rt6i_idev);
1186
1187                 rt->rt6i_gateway = ort->rt6i_gateway;
1188                 rt->rt6i_flags = ort->rt6i_flags;
1189                 rt->rt6i_metric = 0;
1190
1191                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1192 #ifdef CONFIG_IPV6_SUBTREES
1193                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1194 #endif
1195
1196                 dst_free(new);
1197         }
1198
1199         dst_release(dst_orig);
1200         return new ? new : ERR_PTR(-ENOMEM);
1201 }
1202
1203 /*
1204  *      Destination cache support functions
1205  */
1206
1207 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1208 {
1209         if (rt->dst.from &&
1210             dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
1211                 dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
1212 }
1213
1214 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1215 {
1216         if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie))
1217                 return NULL;
1218
1219         if (rt6_check_expired(rt))
1220                 return NULL;
1221
1222         return &rt->dst;
1223 }
1224
1225 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1226 {
1227         if (rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1228             rt6_check((struct rt6_info *)(rt->dst.from), cookie))
1229                 return &rt->dst;
1230         else
1231                 return NULL;
1232 }
1233
1234 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1235 {
1236         struct rt6_info *rt;
1237
1238         rt = (struct rt6_info *) dst;
1239
1240         /* All IPv6 dsts are created with ->obsolete set to the value
1241          * DST_OBSOLETE_FORCE_CHK, which always forces validation calls
1242          * down into this function.
1243          */
1244
1245         rt6_dst_from_metrics_check(rt);
1246
1247         if ((rt->rt6i_flags & RTF_PCPU) || unlikely(dst->flags & DST_NOCACHE))
1248                 return rt6_dst_from_check(rt, cookie);
1249         else
1250                 return rt6_check(rt, cookie);
1251 }
1252
1253 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1254 {
1255         struct rt6_info *rt = (struct rt6_info *) dst;
1256
1257         if (rt) {
1258                 if (rt->rt6i_flags & RTF_CACHE) {
1259                         if (rt6_check_expired(rt)) {
1260                                 ip6_del_rt(rt);
1261                                 dst = NULL;
1262                         }
1263                 } else {
1264                         dst_release(dst);
1265                         dst = NULL;
1266                 }
1267         }
1268         return dst;
1269 }
1270
1271 static void ip6_link_failure(struct sk_buff *skb)
1272 {
1273         struct rt6_info *rt;
1274
1275         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1276
1277         rt = (struct rt6_info *) skb_dst(skb);
1278         if (rt) {
1279                 if (rt->rt6i_flags & RTF_CACHE) {
1280                         dst_hold(&rt->dst);
1281                         if (ip6_del_rt(rt))
1282                                 dst_free(&rt->dst);
1283                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) {
1284                         rt->rt6i_node->fn_sernum = -1;
1285                 }
1286         }
1287 }
1288
1289 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
1290 {
1291         struct net *net = dev_net(rt->dst.dev);
1292
1293         rt->rt6i_flags |= RTF_MODIFIED;
1294         rt->rt6i_pmtu = mtu;
1295         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
1296 }
1297
1298 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
1299                                  const struct ipv6hdr *iph, u32 mtu)
1300 {
1301         struct rt6_info *rt6 = (struct rt6_info *)dst;
1302
1303         if (rt6->rt6i_flags & RTF_LOCAL)
1304                 return;
1305
1306         dst_confirm(dst);
1307         mtu = max_t(u32, mtu, IPV6_MIN_MTU);
1308         if (mtu >= dst_mtu(dst))
1309                 return;
1310
1311         if (rt6->rt6i_flags & RTF_CACHE) {
1312                 rt6_do_update_pmtu(rt6, mtu);
1313         } else {
1314                 const struct in6_addr *daddr, *saddr;
1315                 struct rt6_info *nrt6;
1316
1317                 if (iph) {
1318                         daddr = &iph->daddr;
1319                         saddr = &iph->saddr;
1320                 } else if (sk) {
1321                         daddr = &sk->sk_v6_daddr;
1322                         saddr = &inet6_sk(sk)->saddr;
1323                 } else {
1324                         return;
1325                 }
1326                 nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
1327                 if (nrt6) {
1328                         rt6_do_update_pmtu(nrt6, mtu);
1329
1330                         /* ip6_ins_rt(nrt6) will bump the
1331                          * rt6->rt6i_node->fn_sernum
1332                          * which will fail the next rt6_check() and
1333                          * invalidate the sk->sk_dst_cache.
1334                          */
1335                         ip6_ins_rt(nrt6);
1336                 }
1337         }
1338 }
1339
1340 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1341                                struct sk_buff *skb, u32 mtu)
1342 {
1343         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
1344 }
1345
1346 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
1347                      int oif, u32 mark)
1348 {
1349         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1350         struct dst_entry *dst;
1351         struct flowi6 fl6;
1352
1353         memset(&fl6, 0, sizeof(fl6));
1354         fl6.flowi6_oif = oif;
1355         fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
1356         fl6.daddr = iph->daddr;
1357         fl6.saddr = iph->saddr;
1358         fl6.flowlabel = ip6_flowinfo(iph);
1359
1360         dst = ip6_route_output(net, NULL, &fl6);
1361         if (!dst->error)
1362                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
1363         dst_release(dst);
1364 }
1365 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
1366
1367 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
1368 {
1369         ip6_update_pmtu(skb, sock_net(sk), mtu,
1370                         sk->sk_bound_dev_if, sk->sk_mark);
1371 }
1372 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
1373
1374 /* Handle redirects */
1375 struct ip6rd_flowi {
1376         struct flowi6 fl6;
1377         struct in6_addr gateway;
1378 };
1379
1380 static struct rt6_info *__ip6_route_redirect(struct net *net,
1381                                              struct fib6_table *table,
1382                                              struct flowi6 *fl6,
1383                                              int flags)
1384 {
1385         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1386         struct rt6_info *rt;
1387         struct fib6_node *fn;
1388
1389         /* Get the "current" route for this destination and
1390          * check if the redirect has come from the appropriate router.
1391          *
1392          * RFC 4861 specifies that redirects should only be
1393          * accepted if they come from the nexthop to the target.
1394          * Due to the way the routes are chosen, this notion
1395          * is a bit fuzzy and one might need to check all possible
1396          * routes.
1397          */
1398
1399         read_lock_bh(&table->tb6_lock);
1400         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1401 restart:
1402         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1403                 if (rt6_check_expired(rt))
1404                         continue;
1405                 if (rt->dst.error)
1406                         break;
1407                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1408                         continue;
1409                 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1410                         continue;
1411                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1412                         continue;
1413                 break;
1414         }
1415
1416         if (!rt)
1417                 rt = net->ipv6.ip6_null_entry;
1418         else if (rt->dst.error) {
1419                 rt = net->ipv6.ip6_null_entry;
1420                 goto out;
1421         }
1422
1423         if (rt == net->ipv6.ip6_null_entry) {
1424                 fn = fib6_backtrack(fn, &fl6->saddr);
1425                 if (fn)
1426                         goto restart;
1427         }
1428
1429 out:
1430         dst_hold(&rt->dst);
1431
1432         read_unlock_bh(&table->tb6_lock);
1433
1434         return rt;
1435 }
1436
1437 static struct dst_entry *ip6_route_redirect(struct net *net,
1438                                         const struct flowi6 *fl6,
1439                                         const struct in6_addr *gateway)
1440 {
1441         int flags = RT6_LOOKUP_F_HAS_SADDR;
1442         struct ip6rd_flowi rdfl;
1443
1444         rdfl.fl6 = *fl6;
1445         rdfl.gateway = *gateway;
1446
1447         return fib6_rule_lookup(net, &rdfl.fl6,
1448                                 flags, __ip6_route_redirect);
1449 }
1450
1451 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark)
1452 {
1453         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1454         struct dst_entry *dst;
1455         struct flowi6 fl6;
1456
1457         memset(&fl6, 0, sizeof(fl6));
1458         fl6.flowi6_iif = LOOPBACK_IFINDEX;
1459         fl6.flowi6_oif = oif;
1460         fl6.flowi6_mark = mark;
1461         fl6.daddr = iph->daddr;
1462         fl6.saddr = iph->saddr;
1463         fl6.flowlabel = ip6_flowinfo(iph);
1464
1465         dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
1466         rt6_do_redirect(dst, NULL, skb);
1467         dst_release(dst);
1468 }
1469 EXPORT_SYMBOL_GPL(ip6_redirect);
1470
1471 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
1472                             u32 mark)
1473 {
1474         const struct ipv6hdr *iph = ipv6_hdr(skb);
1475         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
1476         struct dst_entry *dst;
1477         struct flowi6 fl6;
1478
1479         memset(&fl6, 0, sizeof(fl6));
1480         fl6.flowi6_iif = LOOPBACK_IFINDEX;
1481         fl6.flowi6_oif = oif;
1482         fl6.flowi6_mark = mark;
1483         fl6.daddr = msg->dest;
1484         fl6.saddr = iph->daddr;
1485
1486         dst = ip6_route_redirect(net, &fl6, &iph->saddr);
1487         rt6_do_redirect(dst, NULL, skb);
1488         dst_release(dst);
1489 }
1490
1491 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
1492 {
1493         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark);
1494 }
1495 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
1496
1497 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1498 {
1499         struct net_device *dev = dst->dev;
1500         unsigned int mtu = dst_mtu(dst);
1501         struct net *net = dev_net(dev);
1502
1503         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1504
1505         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1506                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1507
1508         /*
1509          * The maximal non-jumbo IPv6 payload is IPV6_MAXPLEN, and the
1510          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1511          * IPV6_MAXPLEN is also valid and means: "any MSS,
1512          * rely only on PMTU discovery".
1513          */
1514         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1515                 mtu = IPV6_MAXPLEN;
1516         return mtu;
1517 }
1518
1519 static unsigned int ip6_mtu(const struct dst_entry *dst)
1520 {
1521         const struct rt6_info *rt = (const struct rt6_info *)dst;
1522         unsigned int mtu = rt->rt6i_pmtu;
1523         struct inet6_dev *idev;
1524
1525         if (mtu)
1526                 goto out;
1527
1528         mtu = dst_metric_raw(dst, RTAX_MTU);
1529         if (mtu)
1530                 goto out;
1531
1532         mtu = IPV6_MIN_MTU;
1533
1534         rcu_read_lock();
1535         idev = __in6_dev_get(dst->dev);
1536         if (idev)
1537                 mtu = idev->cnf.mtu6;
1538         rcu_read_unlock();
1539
1540 out:
1541         return min_t(unsigned int, mtu, IP6_MAX_MTU);
1542 }
1543
1544 static struct dst_entry *icmp6_dst_gc_list;
1545 static DEFINE_SPINLOCK(icmp6_dst_lock);
1546
1547 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1548                                   struct flowi6 *fl6)
1549 {
1550         struct dst_entry *dst;
1551         struct rt6_info *rt;
1552         struct inet6_dev *idev = in6_dev_get(dev);
1553         struct net *net = dev_net(dev);
1554
1555         if (unlikely(!idev))
1556                 return ERR_PTR(-ENODEV);
1557
1558         rt = ip6_dst_alloc(net, dev, 0, NULL);
1559         if (unlikely(!rt)) {
1560                 in6_dev_put(idev);
1561                 dst = ERR_PTR(-ENOMEM);
1562                 goto out;
1563         }
1564
1565         rt->dst.flags |= DST_HOST;
1566         rt->dst.output  = ip6_output;
1567         atomic_set(&rt->dst.__refcnt, 1);
1568         rt->rt6i_gateway  = fl6->daddr;
1569         rt->rt6i_dst.addr = fl6->daddr;
1570         rt->rt6i_dst.plen = 128;
1571         rt->rt6i_idev     = idev;
1572         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
1573
1574         spin_lock_bh(&icmp6_dst_lock);
1575         rt->dst.next = icmp6_dst_gc_list;
1576         icmp6_dst_gc_list = &rt->dst;
1577         spin_unlock_bh(&icmp6_dst_lock);
1578
1579         fib6_force_start_gc(net);
1580
1581         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
1582
1583 out:
1584         return dst;
1585 }
1586
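/*
 * Reclaim entries on icmp6_dst_gc_list whose refcount has dropped to
 * zero; returns the number of entries that are still in use.
 */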
1587 int icmp6_dst_gc(void)
1588 {
1589         struct dst_entry *dst, **pprev;
1590         int more = 0;
1591
1592         spin_lock_bh(&icmp6_dst_lock);
1593         pprev = &icmp6_dst_gc_list;
1594
1595         while ((dst = *pprev) != NULL) {
1596                 if (!atomic_read(&dst->__refcnt)) {
1597                         *pprev = dst->next;
1598                         dst_free(dst);
1599                 } else {
1600                         pprev = &dst->next;
1601                         ++more;
1602                 }
1603         }
1604
1605         spin_unlock_bh(&icmp6_dst_lock);
1606
1607         return more;
1608 }
1609
1610 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1611                             void *arg)
1612 {
1613         struct dst_entry *dst, **pprev;
1614
1615         spin_lock_bh(&icmp6_dst_lock);
1616         pprev = &icmp6_dst_gc_list;
1617         while ((dst = *pprev) != NULL) {
1618                 struct rt6_info *rt = (struct rt6_info *) dst;
1619                 if (func(rt, arg)) {
1620                         *pprev = dst->next;
1621                         dst_free(dst);
1622                 } else {
1623                         pprev = &dst->next;
1624                 }
1625         }
1626         spin_unlock_bh(&icmp6_dst_lock);
1627 }
1628
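/*
 * dst_ops garbage collector: run fib6_run_gc() when the minimum GC
 * interval has elapsed or the number of entries exceeds ip6_rt_max_size,
 * adapting ip6_rt_gc_expire to how effective the pass was.  Returns
 * nonzero while the table is still over rt_max_size.
 */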
1629 static int ip6_dst_gc(struct dst_ops *ops)
1630 {
1631         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1632         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1633         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1634         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1635         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1636         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1637         int entries;
1638
1639         entries = dst_entries_get_fast(ops);
1640         if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
1641             entries <= rt_max_size)
1642                 goto out;
1643
1644         net->ipv6.ip6_rt_gc_expire++;
1645         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
1646         entries = dst_entries_get_slow(ops);
1647         if (entries < ops->gc_thresh)
1648                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1649 out:
1650         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1651         return entries > rt_max_size;
1652 }
1653
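/*
 * Convert the RTAX_* metrics nested in cfg->fc_mx into the mx6_config
 * array passed to __ip6_ins_rt().  RTAX_CC_ALGO is translated from a
 * congestion-control name to its key.  The caller must kfree(mxc->mx).
 */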
1654 static int ip6_convert_metrics(struct mx6_config *mxc,
1655                                const struct fib6_config *cfg)
1656 {
1657         struct nlattr *nla;
1658         int remaining;
1659         u32 *mp;
1660
1661         if (!cfg->fc_mx)
1662                 return 0;
1663
1664         mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1665         if (unlikely(!mp))
1666                 return -ENOMEM;
1667
1668         nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1669                 int type = nla_type(nla);
1670
1671                 if (type) {
1672                         u32 val;
1673
1674                         if (unlikely(type > RTAX_MAX))
1675                                 goto err;
1676                         if (type == RTAX_CC_ALGO) {
1677                                 char tmp[TCP_CA_NAME_MAX];
1678
1679                                 nla_strlcpy(tmp, nla, sizeof(tmp));
1680                                 val = tcp_ca_get_key_by_name(tmp);
1681                                 if (val == TCP_CA_UNSPEC)
1682                                         goto err;
1683                         } else {
1684                                 val = nla_get_u32(nla);
1685                         }
1686
1687                         mp[type - 1] = val;
1688                         __set_bit(type - 1, mxc->mx_valid);
1689                 }
1690         }
1691
1692         mxc->mx = mp;
1693
1694         return 0;
1695  err:
1696         kfree(mp);
1697         return -EINVAL;
1698 }
1699
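/*
 * Add a route described by a fib6_config: validate the prefix lengths,
 * resolve the output device and gateway, handle reject/blackhole/
 * prohibit/throw types specially, convert the metrics and insert the
 * result with __ip6_ins_rt().
 */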
1700 int ip6_route_add(struct fib6_config *cfg)
1701 {
1702         int err;
1703         struct net *net = cfg->fc_nlinfo.nl_net;
1704         struct rt6_info *rt = NULL;
1705         struct net_device *dev = NULL;
1706         struct inet6_dev *idev = NULL;
1707         struct fib6_table *table;
1708         struct mx6_config mxc = { .mx = NULL, };
1709         int addr_type;
1710
1711         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1712                 return -EINVAL;
1713 #ifndef CONFIG_IPV6_SUBTREES
1714         if (cfg->fc_src_len)
1715                 return -EINVAL;
1716 #endif
1717         if (cfg->fc_ifindex) {
1718                 err = -ENODEV;
1719                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1720                 if (!dev)
1721                         goto out;
1722                 idev = in6_dev_get(dev);
1723                 if (!idev)
1724                         goto out;
1725         }
1726
1727         if (cfg->fc_metric == 0)
1728                 cfg->fc_metric = IP6_RT_PRIO_USER;
1729
1730         err = -ENOBUFS;
1731         if (cfg->fc_nlinfo.nlh &&
1732             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1733                 table = fib6_get_table(net, cfg->fc_table);
1734                 if (!table) {
1735                         pr_warn("NLM_F_CREATE should be specified when creating new route\n");
1736                         table = fib6_new_table(net, cfg->fc_table);
1737                 }
1738         } else {
1739                 table = fib6_new_table(net, cfg->fc_table);
1740         }
1741
1742         if (!table)
1743                 goto out;
1744
1745         rt = ip6_dst_alloc(net, NULL, (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT, table);
1746
1747         if (!rt) {
1748                 err = -ENOMEM;
1749                 goto out;
1750         }
1751
1752         if (cfg->fc_flags & RTF_EXPIRES)
1753                 rt6_set_expires(rt, jiffies +
1754                                 clock_t_to_jiffies(cfg->fc_expires));
1755         else
1756                 rt6_clean_expires(rt);
1757
1758         if (cfg->fc_protocol == RTPROT_UNSPEC)
1759                 cfg->fc_protocol = RTPROT_BOOT;
1760         rt->rt6i_protocol = cfg->fc_protocol;
1761
1762         addr_type = ipv6_addr_type(&cfg->fc_dst);
1763
1764         if (addr_type & IPV6_ADDR_MULTICAST)
1765                 rt->dst.input = ip6_mc_input;
1766         else if (cfg->fc_flags & RTF_LOCAL)
1767                 rt->dst.input = ip6_input;
1768         else
1769                 rt->dst.input = ip6_forward;
1770
1771         rt->dst.output = ip6_output;
1772
1773         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1774         rt->rt6i_dst.plen = cfg->fc_dst_len;
1775         if (rt->rt6i_dst.plen == 128)
1776                 rt->dst.flags |= DST_HOST;
1777
1778 #ifdef CONFIG_IPV6_SUBTREES
1779         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1780         rt->rt6i_src.plen = cfg->fc_src_len;
1781 #endif
1782
1783         rt->rt6i_metric = cfg->fc_metric;
1784
1785         /* We cannot add true routes via loopback here; they would result
1786            in kernel looping.  Promote them to reject routes instead.
1787          */
1788         if ((cfg->fc_flags & RTF_REJECT) ||
1789             (dev && (dev->flags & IFF_LOOPBACK) &&
1790              !(addr_type & IPV6_ADDR_LOOPBACK) &&
1791              !(cfg->fc_flags & RTF_LOCAL))) {
1792                 /* hold loopback dev/idev if we haven't done so. */
1793                 if (dev != net->loopback_dev) {
1794                         if (dev) {
1795                                 dev_put(dev);
1796                                 in6_dev_put(idev);
1797                         }
1798                         dev = net->loopback_dev;
1799                         dev_hold(dev);
1800                         idev = in6_dev_get(dev);
1801                         if (!idev) {
1802                                 err = -ENODEV;
1803                                 goto out;
1804                         }
1805                 }
1806                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1807                 switch (cfg->fc_type) {
1808                 case RTN_BLACKHOLE:
1809                         rt->dst.error = -EINVAL;
1810                         rt->dst.output = dst_discard_sk;
1811                         rt->dst.input = dst_discard;
1812                         break;
1813                 case RTN_PROHIBIT:
1814                         rt->dst.error = -EACCES;
1815                         rt->dst.output = ip6_pkt_prohibit_out;
1816                         rt->dst.input = ip6_pkt_prohibit;
1817                         break;
1818                 case RTN_THROW:
1819                 default:
1820                         rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
1821                                         : -ENETUNREACH;
1822                         rt->dst.output = ip6_pkt_discard_out;
1823                         rt->dst.input = ip6_pkt_discard;
1824                         break;
1825                 }
1826                 goto install_route;
1827         }
1828
1829         if (cfg->fc_flags & RTF_GATEWAY) {
1830                 const struct in6_addr *gw_addr;
1831                 int gwa_type;
1832
1833                 gw_addr = &cfg->fc_gateway;
1834                 gwa_type = ipv6_addr_type(gw_addr);
1835
1836                 /* If gw_addr is local we will fail to detect this while the
1837                  * address is still TENTATIVE (DAD in progress): rt6_lookup()
1838                  * will return the already-added prefix route via the interface
1839                  * the prefix route was assigned to, which might be non-loopback.
1840                  */
1841                 err = -EINVAL;
1842                 if (ipv6_chk_addr_and_flags(net, gw_addr,
1843                                             gwa_type & IPV6_ADDR_LINKLOCAL ?
1844                                             dev : NULL, 0, 0))
1845                         goto out;
1846
1847                 rt->rt6i_gateway = *gw_addr;
1848
1849                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1850                         struct rt6_info *grt;
1851
1852                         /* IPv6 strictly forbids using non-link-local
1853                            addresses as the nexthop address; otherwise the
1854                            router will not be able to send redirects.
1855                            That is a good rule, but in some (rare!) circumstances
1856                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1857                            some exceptions. --ANK
1858                          */
1859                         if (!(gwa_type & IPV6_ADDR_UNICAST))
1860                                 goto out;
1861
1862                         grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1863
1864                         err = -EHOSTUNREACH;
1865                         if (!grt)
1866                                 goto out;
1867                         if (dev) {
1868                                 if (dev != grt->dst.dev) {
1869                                         ip6_rt_put(grt);
1870                                         goto out;
1871                                 }
1872                         } else {
1873                                 dev = grt->dst.dev;
1874                                 idev = grt->rt6i_idev;
1875                                 dev_hold(dev);
1876                                 in6_dev_hold(grt->rt6i_idev);
1877                         }
1878                         if (!(grt->rt6i_flags & RTF_GATEWAY))
1879                                 err = 0;
1880                         ip6_rt_put(grt);
1881
1882                         if (err)
1883                                 goto out;
1884                 }
1885                 err = -EINVAL;
1886                 if (!dev || (dev->flags & IFF_LOOPBACK))
1887                         goto out;
1888         }
1889
1890         err = -ENODEV;
1891         if (!dev)
1892                 goto out;
1893
1894         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1895                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1896                         err = -EINVAL;
1897                         goto out;
1898                 }
1899                 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
1900                 rt->rt6i_prefsrc.plen = 128;
1901         } else
1902                 rt->rt6i_prefsrc.plen = 0;
1903
1904         rt->rt6i_flags = cfg->fc_flags;
1905
1906 install_route:
1907         rt->dst.dev = dev;
1908         rt->rt6i_idev = idev;
1909         rt->rt6i_table = table;
1910
1911         cfg->fc_nlinfo.nl_net = dev_net(dev);
1912
1913         err = ip6_convert_metrics(&mxc, cfg);
1914         if (err)
1915                 goto out;
1916
1917         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc);
1918
1919         kfree(mxc.mx);
1920         return err;
1921 out:
1922         if (dev)
1923                 dev_put(dev);
1924         if (idev)
1925                 in6_dev_put(idev);
1926         if (rt)
1927                 dst_free(&rt->dst);
1928         return err;
1929 }
1930
1931 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1932 {
1933         int err;
1934         struct fib6_table *table;
1935         struct net *net = dev_net(rt->dst.dev);
1936
1937         if (rt == net->ipv6.ip6_null_entry) {
1938                 err = -ENOENT;
1939                 goto out;
1940         }
1941
1942         table = rt->rt6i_table;
1943         write_lock_bh(&table->tb6_lock);
1944         err = fib6_del(rt, info);
1945         write_unlock_bh(&table->tb6_lock);
1946
1947 out:
1948         ip6_rt_put(rt);
1949         return err;
1950 }
1951
1952 int ip6_del_rt(struct rt6_info *rt)
1953 {
1954         struct nl_info info = {
1955                 .nl_net = dev_net(rt->dst.dev),
1956         };
1957         return __ip6_del_rt(rt, &info);
1958 }
1959
1960 static int ip6_route_del(struct fib6_config *cfg)
1961 {
1962         struct fib6_table *table;
1963         struct fib6_node *fn;
1964         struct rt6_info *rt;
1965         int err = -ESRCH;
1966
1967         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1968         if (!table)
1969                 return err;
1970
1971         read_lock_bh(&table->tb6_lock);
1972
1973         fn = fib6_locate(&table->tb6_root,
1974                          &cfg->fc_dst, cfg->fc_dst_len,
1975                          &cfg->fc_src, cfg->fc_src_len);
1976
1977         if (fn) {
1978                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1979                         if ((rt->rt6i_flags & RTF_CACHE) &&
1980                             !(cfg->fc_flags & RTF_CACHE))
1981                                 continue;
1982                         if (cfg->fc_ifindex &&
1983                             (!rt->dst.dev ||
1984                              rt->dst.dev->ifindex != cfg->fc_ifindex))
1985                                 continue;
1986                         if (cfg->fc_flags & RTF_GATEWAY &&
1987                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1988                                 continue;
1989                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1990                                 continue;
1991                         dst_hold(&rt->dst);
1992                         read_unlock_bh(&table->tb6_lock);
1993
1994                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1995                 }
1996         }
1997         read_unlock_bh(&table->tb6_lock);
1998
1999         return err;
2000 }
2001
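/*
 * Handle an ICMPv6 Redirect for the dst it was looked up against:
 * validate the message and its ND options (RFC 2461), update the
 * neighbour entry for the new first hop, install an RTF_CACHE clone
 * pointing at the new gateway and raise a NETEVENT_REDIRECT event.
 */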
2002 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
2003 {
2004         struct net *net = dev_net(skb->dev);
2005         struct netevent_redirect netevent;
2006         struct rt6_info *rt, *nrt = NULL;
2007         struct ndisc_options ndopts;
2008         struct inet6_dev *in6_dev;
2009         struct neighbour *neigh;
2010         struct rd_msg *msg;
2011         int optlen, on_link;
2012         u8 *lladdr;
2013
2014         optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
2015         optlen -= sizeof(*msg);
2016
2017         if (optlen < 0) {
2018                 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
2019                 return;
2020         }
2021
2022         msg = (struct rd_msg *)icmp6_hdr(skb);
2023
2024         if (ipv6_addr_is_multicast(&msg->dest)) {
2025                 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
2026                 return;
2027         }
2028
2029         on_link = 0;
2030         if (ipv6_addr_equal(&msg->dest, &msg->target)) {
2031                 on_link = 1;
2032         } else if (ipv6_addr_type(&msg->target) !=
2033                    (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
2034                 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
2035                 return;
2036         }
2037
2038         in6_dev = __in6_dev_get(skb->dev);
2039         if (!in6_dev)
2040                 return;
2041         if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
2042                 return;
2043
2044         /* RFC2461 8.1:
2045          *      The IP source address of the Redirect MUST be the same as the current
2046          *      first-hop router for the specified ICMP Destination Address.
2047          */
2048
2049         if (!ndisc_parse_options(msg->opt, optlen, &ndopts)) {
2050                 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
2051                 return;
2052         }
2053
2054         lladdr = NULL;
2055         if (ndopts.nd_opts_tgt_lladdr) {
2056                 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
2057                                              skb->dev);
2058                 if (!lladdr) {
2059                         net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
2060                         return;
2061                 }
2062         }
2063
2064         rt = (struct rt6_info *) dst;
2065         if (rt == net->ipv6.ip6_null_entry) {
2066                 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
2067                 return;
2068         }
2069
2070         /* Redirect received -> the path was valid.
2071          * Redirects are sent only in response to data packets,
2072          * so this nexthop is apparently reachable. --ANK
2073          */
2074         dst_confirm(&rt->dst);
2075
2076         neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
2077         if (!neigh)
2078                 return;
2079
2080         /*
2081          *      We have finally decided to accept it.
2082          */
2083
2084         neigh_update(neigh, lladdr, NUD_STALE,
2085                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
2086                      NEIGH_UPDATE_F_OVERRIDE|
2087                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
2088                                      NEIGH_UPDATE_F_ISROUTER))
2089                      );
2090
2091         nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
2092         if (!nrt)
2093                 goto out;
2094
2095         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
2096         if (on_link)
2097                 nrt->rt6i_flags &= ~RTF_GATEWAY;
2098
2099         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
2100
2101         if (ip6_ins_rt(nrt))
2102                 goto out;
2103
2104         netevent.old = &rt->dst;
2105         netevent.new = &nrt->dst;
2106         netevent.daddr = &msg->dest;
2107         netevent.neigh = neigh;
2108         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
2109
2110         if (rt->rt6i_flags & RTF_CACHE) {
2111                 rt = (struct rt6_info *) dst_clone(&rt->dst);
2112                 ip6_del_rt(rt);
2113         }
2114
2115 out:
2116         neigh_release(neigh);
2117 }
2118
2119 /*
2120  *      Misc support functions
2121  */
2122
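/*
 * Record that @rt was cloned from @from: clear RTF_EXPIRES on the clone,
 * hold a reference on the parent dst and share its metrics read-only.
 */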
2123 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
2124 {
2125         BUG_ON(from->dst.from);
2126
2127         rt->rt6i_flags &= ~RTF_EXPIRES;
2128         dst_hold(&from->dst);
2129         rt->dst.from = &from->dst;
2130         dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
2131 }
2132
2133 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
2134 {
2135         rt->dst.input = ort->dst.input;
2136         rt->dst.output = ort->dst.output;
2137         rt->rt6i_dst = ort->rt6i_dst;
2138         rt->dst.error = ort->dst.error;
2139         rt->rt6i_idev = ort->rt6i_idev;
2140         if (rt->rt6i_idev)
2141                 in6_dev_hold(rt->rt6i_idev);
2142         rt->dst.lastuse = jiffies;
2143         rt->rt6i_gateway = ort->rt6i_gateway;
2144         rt->rt6i_flags = ort->rt6i_flags;
2145         rt6_set_from(rt, ort);
2146         rt->rt6i_metric = ort->rt6i_metric;
2147 #ifdef CONFIG_IPV6_SUBTREES
2148         rt->rt6i_src = ort->rt6i_src;
2149 #endif
2150         rt->rt6i_prefsrc = ort->rt6i_prefsrc;
2151         rt->rt6i_table = ort->rt6i_table;
2152 }
2153
2154 #ifdef CONFIG_IPV6_ROUTE_INFO
2155 static struct rt6_info *rt6_get_route_info(struct net *net,
2156                                            const struct in6_addr *prefix, int prefixlen,
2157                                            const struct in6_addr *gwaddr, int ifindex)
2158 {
2159         struct fib6_node *fn;
2160         struct rt6_info *rt = NULL;
2161         struct fib6_table *table;
2162
2163         table = fib6_get_table(net, RT6_TABLE_INFO);
2164         if (!table)
2165                 return NULL;
2166
2167         read_lock_bh(&table->tb6_lock);
2168         fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0);
2169         if (!fn)
2170                 goto out;
2171
2172         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2173                 if (rt->dst.dev->ifindex != ifindex)
2174                         continue;
2175                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
2176                         continue;
2177                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
2178                         continue;
2179                 dst_hold(&rt->dst);
2180                 break;
2181         }
2182 out:
2183         read_unlock_bh(&table->tb6_lock);
2184         return rt;
2185 }
2186
2187 static struct rt6_info *rt6_add_route_info(struct net *net,
2188                                            const struct in6_addr *prefix, int prefixlen,
2189                                            const struct in6_addr *gwaddr, int ifindex,
2190                                            unsigned int pref)
2191 {
2192         struct fib6_config cfg = {
2193                 .fc_table       = RT6_TABLE_INFO,
2194                 .fc_metric      = IP6_RT_PRIO_USER,
2195                 .fc_ifindex     = ifindex,
2196                 .fc_dst_len     = prefixlen,
2197                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
2198                                   RTF_UP | RTF_PREF(pref),
2199                 .fc_nlinfo.portid = 0,
2200                 .fc_nlinfo.nlh = NULL,
2201                 .fc_nlinfo.nl_net = net,
2202         };
2203
2204         cfg.fc_dst = *prefix;
2205         cfg.fc_gateway = *gwaddr;
2206
2207         /* We should treat it as a default route if prefix length is 0. */
2208         if (!prefixlen)
2209                 cfg.fc_flags |= RTF_DEFAULT;
2210
2211         ip6_route_add(&cfg);
2212
2213         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
2214 }
2215 #endif
2216
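/*
 * Look up a default router entry (RTF_ADDRCONF | RTF_DEFAULT) with
 * gateway @addr on @dev in RT6_TABLE_DFLT, taking a reference on the
 * route if found.
 */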
2217 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
2218 {
2219         struct rt6_info *rt;
2220         struct fib6_table *table;
2221
2222         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
2223         if (!table)
2224                 return NULL;
2225
2226         read_lock_bh(&table->tb6_lock);
2227         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2228                 if (dev == rt->dst.dev &&
2229                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
2230                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
2231                         break;
2232         }
2233         if (rt)
2234                 dst_hold(&rt->dst);
2235         read_unlock_bh(&table->tb6_lock);
2236         return rt;
2237 }
2238
2239 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
2240                                      struct net_device *dev,
2241                                      unsigned int pref)
2242 {
2243         struct fib6_config cfg = {
2244                 .fc_table       = RT6_TABLE_DFLT,
2245                 .fc_metric      = IP6_RT_PRIO_USER,
2246                 .fc_ifindex     = dev->ifindex,
2247                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
2248                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
2249                 .fc_nlinfo.portid = 0,
2250                 .fc_nlinfo.nlh = NULL,
2251                 .fc_nlinfo.nl_net = dev_net(dev),
2252         };
2253
2254         cfg.fc_gateway = *gwaddr;
2255
2256         ip6_route_add(&cfg);
2257
2258         return rt6_get_dflt_router(gwaddr, dev);
2259 }
2260
2261 void rt6_purge_dflt_routers(struct net *net)
2262 {
2263         struct rt6_info *rt;
2264         struct fib6_table *table;
2265
2266         /* NOTE: Keep consistent with rt6_get_dflt_router */
2267         table = fib6_get_table(net, RT6_TABLE_DFLT);
2268         if (!table)
2269                 return;
2270
2271 restart:
2272         read_lock_bh(&table->tb6_lock);
2273         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2274                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
2275                     (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
2276                         dst_hold(&rt->dst);
2277                         read_unlock_bh(&table->tb6_lock);
2278                         ip6_del_rt(rt);
2279                         goto restart;
2280                 }
2281         }
2282         read_unlock_bh(&table->tb6_lock);
2283 }
2284
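/*
 * Translate the legacy in6_rtmsg used by the SIOCADDRT/SIOCDELRT ioctls
 * into a fib6_config targeting the main table.
 */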
2285 static void rtmsg_to_fib6_config(struct net *net,
2286                                  struct in6_rtmsg *rtmsg,
2287                                  struct fib6_config *cfg)
2288 {
2289         memset(cfg, 0, sizeof(*cfg));
2290
2291         cfg->fc_table = RT6_TABLE_MAIN;
2292         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
2293         cfg->fc_metric = rtmsg->rtmsg_metric;
2294         cfg->fc_expires = rtmsg->rtmsg_info;
2295         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
2296         cfg->fc_src_len = rtmsg->rtmsg_src_len;
2297         cfg->fc_flags = rtmsg->rtmsg_flags;
2298
2299         cfg->fc_nlinfo.nl_net = net;
2300
2301         cfg->fc_dst = rtmsg->rtmsg_dst;
2302         cfg->fc_src = rtmsg->rtmsg_src;
2303         cfg->fc_gateway = rtmsg->rtmsg_gateway;
2304 }
2305
2306 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
2307 {
2308         struct fib6_config cfg;
2309         struct in6_rtmsg rtmsg;
2310         int err;
2311
2312         switch (cmd) {
2313         case SIOCADDRT:         /* Add a route */
2314         case SIOCDELRT:         /* Delete a route */
2315                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
2316                         return -EPERM;
2317                 err = copy_from_user(&rtmsg, arg,
2318                                      sizeof(struct in6_rtmsg));
2319                 if (err)
2320                         return -EFAULT;
2321
2322                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
2323
2324                 rtnl_lock();
2325                 switch (cmd) {
2326                 case SIOCADDRT:
2327                         err = ip6_route_add(&cfg);
2328                         break;
2329                 case SIOCDELRT:
2330                         err = ip6_route_del(&cfg);
2331                         break;
2332                 default:
2333                         err = -EINVAL;
2334                 }
2335                 rtnl_unlock();
2336
2337                 return err;
2338         }
2339
2340         return -EINVAL;
2341 }
2342
2343 /*
2344  *      Drop the packet on the floor
2345  */
2346
2347 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2348 {
2349         int type;
2350         struct dst_entry *dst = skb_dst(skb);
2351         switch (ipstats_mib_noroutes) {
2352         case IPSTATS_MIB_INNOROUTES:
2353                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2354                 if (type == IPV6_ADDR_ANY) {
2355                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2356                                       IPSTATS_MIB_INADDRERRORS);
2357                         break;
2358                 }
2359                 /* FALLTHROUGH */
2360         case IPSTATS_MIB_OUTNOROUTES:
2361                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2362                               ipstats_mib_noroutes);
2363                 break;
2364         }
2365         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2366         kfree_skb(skb);
2367         return 0;
2368 }
2369
2370 static int ip6_pkt_discard(struct sk_buff *skb)
2371 {
2372         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2373 }
2374
2375 static int ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb)
2376 {
2377         skb->dev = skb_dst(skb)->dev;
2378         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2379 }
2380
2381 static int ip6_pkt_prohibit(struct sk_buff *skb)
2382 {
2383         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2384 }
2385
2386 static int ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb)
2387 {
2388         skb->dev = skb_dst(skb)->dev;
2389         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2390 }
2391
2392 /*
2393  *      Allocate a dst for a local (unicast / anycast) address.
2394  */
2395
2396 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2397                                     const struct in6_addr *addr,
2398                                     bool anycast)
2399 {
2400         struct net *net = dev_net(idev->dev);
2401         struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev,
2402                                             DST_NOCOUNT, NULL);
2403         if (!rt)
2404                 return ERR_PTR(-ENOMEM);
2405
2406         in6_dev_hold(idev);
2407
2408         rt->dst.flags |= DST_HOST;
2409         rt->dst.input = ip6_input;
2410         rt->dst.output = ip6_output;
2411         rt->rt6i_idev = idev;
2412
2413         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2414         if (anycast)
2415                 rt->rt6i_flags |= RTF_ANYCAST;
2416         else
2417                 rt->rt6i_flags |= RTF_LOCAL;
2418
2419         rt->rt6i_gateway  = *addr;
2420         rt->rt6i_dst.addr = *addr;
2421         rt->rt6i_dst.plen = 128;
2422         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2423
2424         atomic_set(&rt->dst.__refcnt, 1);
2425
2426         return rt;
2427 }
2428
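/*
 * Pick a source address for @daddr: use the route's preferred source
 * (rt6i_prefsrc) when set, otherwise fall back to normal source-address
 * selection on the route's device.
 */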
2429 int ip6_route_get_saddr(struct net *net,
2430                         struct rt6_info *rt,
2431                         const struct in6_addr *daddr,
2432                         unsigned int prefs,
2433                         struct in6_addr *saddr)
2434 {
2435         struct inet6_dev *idev =
2436                 rt ? ip6_dst_idev((struct dst_entry *)rt) : NULL;
2437         int err = 0;
2438         if (rt && rt->rt6i_prefsrc.plen)
2439                 *saddr = rt->rt6i_prefsrc.addr;
2440         else
2441                 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2442                                          daddr, prefs, saddr);
2443         return err;
2444 }
2445
2446 /* Remove a deleted IP from prefsrc entries. */
2447 struct arg_dev_net_ip {
2448         struct net_device *dev;
2449         struct net *net;
2450         struct in6_addr *addr;
2451 };
2452
2453 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2454 {
2455         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2456         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2457         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2458
2459         if (((void *)rt->dst.dev == dev || !dev) &&
2460             rt != net->ipv6.ip6_null_entry &&
2461             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2462                 /* remove prefsrc entry */
2463                 rt->rt6i_prefsrc.plen = 0;
2464         }
2465         return 0;
2466 }
2467
2468 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2469 {
2470         struct net *net = dev_net(ifp->idev->dev);
2471         struct arg_dev_net_ip adni = {
2472                 .dev = ifp->idev->dev,
2473                 .net = net,
2474                 .addr = &ifp->addr,
2475         };
2476         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
2477 }
2478
2479 #define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
2480 #define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
2481
2482 /* Remove routers and update dst entries when a gateway turns into a host. */
2483 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
2484 {
2485         struct in6_addr *gateway = (struct in6_addr *)arg;
2486
2487         if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) ||
2488              ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) &&
2489              ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
2490                 return -1;
2491         }
2492         return 0;
2493 }
2494
2495 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
2496 {
2497         fib6_clean_all(net, fib6_clean_tohost, gateway);
2498 }
2499
2500 struct arg_dev_net {
2501         struct net_device *dev;
2502         struct net *net;
2503 };
2504
2505 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2506 {
2507         const struct arg_dev_net *adn = arg;
2508         const struct net_device *dev = adn->dev;
2509
2510         if ((rt->dst.dev == dev || !dev) &&
2511             rt != adn->net->ipv6.ip6_null_entry)
2512                 return -1;
2513
2514         return 0;
2515 }
2516
2517 void rt6_ifdown(struct net *net, struct net_device *dev)
2518 {
2519         struct arg_dev_net adn = {
2520                 .dev = dev,
2521                 .net = net,
2522         };
2523
2524         fib6_clean_all(net, fib6_ifdown, &adn);
2525         icmp6_clean_all(fib6_ifdown, &adn);
2526         rt6_uncached_list_flush_dev(net, dev);
2527 }
2528
2529 struct rt6_mtu_change_arg {
2530         struct net_device *dev;
2531         unsigned int mtu;
2532 };
2533
2534 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2535 {
2536         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2537         struct inet6_dev *idev;
2538
2539         /* In IPv6, PMTU discovery is not optional,
2540            so the RTAX_MTU lock cannot disable it.
2541            We still use this lock to block changes
2542            caused by addrconf/ndisc.
2543         */
2544
2545         idev = __in6_dev_get(arg->dev);
2546         if (!idev)
2547                 return 0;
2548
2549         /* For an administrative MTU increase, there is no way to discover
2550            the IPv6 PMTU increase, so the PMTU has to be updated here.
2551            Since RFC 1981 doesn't cover administrative MTU increases,
2552            updating the PMTU on an increase is a MUST (e.g. jumbo frames).
2553          */
2554         /*
2555            If the new MTU is less than the route PMTU, this new MTU will be
2556            the lowest MTU in the path; update the route PMTU to reflect the
2557            decrease.  If the new MTU is greater than the route PMTU and the
2558            old MTU was the lowest MTU in the path, update the route PMTU to
2559            reflect the increase.  In that case, if another node's MTU is now
2560            the lowest in the path, a Packet Too Big message will trigger
2561            PMTU discovery.
2562          */
2563         if (rt->dst.dev == arg->dev &&
2564             !dst_metric_locked(&rt->dst, RTAX_MTU)) {
2565                 if (rt->rt6i_flags & RTF_CACHE) {
2566                         /* For RTF_CACHE with rt6i_pmtu == 0
2567                          * (i.e. a redirected route),
2568                          * the metrics of its rt->dst.from have already
2569                          * been updated.
2570                          */
2571                         if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu)
2572                                 rt->rt6i_pmtu = arg->mtu;
2573                 } else if (dst_mtu(&rt->dst) >= arg->mtu ||
2574                            (dst_mtu(&rt->dst) < arg->mtu &&
2575                             dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
2576                         dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2577                 }
2578         }
2579         return 0;
2580 }
2581
2582 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
2583 {
2584         struct rt6_mtu_change_arg arg = {
2585                 .dev = dev,
2586                 .mtu = mtu,
2587         };
2588
2589         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
2590 }
2591
2592 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2593         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2594         [RTA_OIF]               = { .type = NLA_U32 },
2595         [RTA_IIF]               = { .type = NLA_U32 },
2596         [RTA_PRIORITY]          = { .type = NLA_U32 },
2597         [RTA_METRICS]           = { .type = NLA_NESTED },
2598         [RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
2599         [RTA_PREF]              = { .type = NLA_U8 },
2600 };
2601
2602 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2603                               struct fib6_config *cfg)
2604 {
2605         struct rtmsg *rtm;
2606         struct nlattr *tb[RTA_MAX+1];
2607         unsigned int pref;
2608         int err;
2609
2610         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2611         if (err < 0)
2612                 goto errout;
2613
2614         err = -EINVAL;
2615         rtm = nlmsg_data(nlh);
2616         memset(cfg, 0, sizeof(*cfg));
2617
2618         cfg->fc_table = rtm->rtm_table;
2619         cfg->fc_dst_len = rtm->rtm_dst_len;
2620         cfg->fc_src_len = rtm->rtm_src_len;
2621         cfg->fc_flags = RTF_UP;
2622         cfg->fc_protocol = rtm->rtm_protocol;
2623         cfg->fc_type = rtm->rtm_type;
2624
2625         if (rtm->rtm_type == RTN_UNREACHABLE ||
2626             rtm->rtm_type == RTN_BLACKHOLE ||
2627             rtm->rtm_type == RTN_PROHIBIT ||
2628             rtm->rtm_type == RTN_THROW)
2629                 cfg->fc_flags |= RTF_REJECT;
2630
2631         if (rtm->rtm_type == RTN_LOCAL)
2632                 cfg->fc_flags |= RTF_LOCAL;
2633
2634         if (rtm->rtm_flags & RTM_F_CLONED)
2635                 cfg->fc_flags |= RTF_CACHE;
2636
2637         cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
2638         cfg->fc_nlinfo.nlh = nlh;
2639         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2640
2641         if (tb[RTA_GATEWAY]) {
2642                 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
2643                 cfg->fc_flags |= RTF_GATEWAY;
2644         }
2645
2646         if (tb[RTA_DST]) {
2647                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2648
2649                 if (nla_len(tb[RTA_DST]) < plen)
2650                         goto errout;
2651
2652                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2653         }
2654
2655         if (tb[RTA_SRC]) {
2656                 int plen = (rtm->rtm_src_len + 7) >> 3;
2657
2658                 if (nla_len(tb[RTA_SRC]) < plen)
2659                         goto errout;
2660
2661                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2662         }
2663
2664         if (tb[RTA_PREFSRC])
2665                 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
2666
2667         if (tb[RTA_OIF])
2668                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2669
2670         if (tb[RTA_PRIORITY])
2671                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2672
2673         if (tb[RTA_METRICS]) {
2674                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2675                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2676         }
2677
2678         if (tb[RTA_TABLE])
2679                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2680
2681         if (tb[RTA_MULTIPATH]) {
2682                 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
2683                 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
2684         }
2685
2686         if (tb[RTA_PREF]) {
2687                 pref = nla_get_u8(tb[RTA_PREF]);
2688                 if (pref != ICMPV6_ROUTER_PREF_LOW &&
2689                     pref != ICMPV6_ROUTER_PREF_HIGH)
2690                         pref = ICMPV6_ROUTER_PREF_MEDIUM;
2691                 cfg->fc_flags |= RTF_PREF(pref);
2692         }
2693
2694         err = 0;
2695 errout:
2696         return err;
2697 }
2698
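/*
 * Add or delete (according to @add) every nexthop carried in the
 * RTA_MULTIPATH attribute.  A failed add rolls back the nexthops already
 * installed; on delete, errors are recorded but the remaining nexthops
 * are still attempted.  Returns the last error seen.
 */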
2699 static int ip6_route_multipath(struct fib6_config *cfg, int add)
2700 {
2701         struct fib6_config r_cfg;
2702         struct rtnexthop *rtnh;
2703         int remaining;
2704         int attrlen;
2705         int err = 0, last_err = 0;
2706
2707         remaining = cfg->fc_mp_len;
2708 beginning:
2709         rtnh = (struct rtnexthop *)cfg->fc_mp;
2710
2711         /* Parse a Multipath Entry */
2712         while (rtnh_ok(rtnh, remaining)) {
2713                 memcpy(&r_cfg, cfg, sizeof(*cfg));
2714                 if (rtnh->rtnh_ifindex)
2715                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
2716
2717                 attrlen = rtnh_attrlen(rtnh);
2718                 if (attrlen > 0) {
2719                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
2720
2721                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
2722                         if (nla) {
2723                                 r_cfg.fc_gateway = nla_get_in6_addr(nla);
2724                                 r_cfg.fc_flags |= RTF_GATEWAY;
2725                         }
2726                 }
2727                 err = add ? ip6_route_add(&r_cfg) : ip6_route_del(&r_cfg);
2728                 if (err) {
2729                         last_err = err;
2730                         /* If we are trying to remove a route, do not stop the
2731                          * loop when ip6_route_del() fails (because the next hop
2732                          * is already gone); we should try to remove all next hops.
2733                          */
2734                         if (add) {
2735                                 /* If add fails, we should try to delete all
2736                                  * next hops that have already been added.
2737                                  */
2738                                 add = 0;
2739                                 remaining = cfg->fc_mp_len - remaining;
2740                                 goto beginning;
2741                         }
2742                 }
2743                 /* Because each route is added like a single route, we remove
2744                  * these flags after the first nexthop: if there is a collision,
2745                  * we have already failed to add the first nexthop
2746                  * (fib6_add_rt2node() has rejected it); when replacing, the old
2747                  * nexthops have already been replaced by the first new one, and
2748                  * the rest should be added to it.
2749                  */
2750                 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
2751                                                      NLM_F_REPLACE);
2752                 rtnh = rtnh_next(rtnh, &remaining);
2753         }
2754
2755         return last_err;
2756 }
2757
2758 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
2759 {
2760         struct fib6_config cfg;
2761         int err;
2762
2763         err = rtm_to_fib6_config(skb, nlh, &cfg);
2764         if (err < 0)
2765                 return err;
2766
2767         if (cfg.fc_mp)
2768                 return ip6_route_multipath(&cfg, 0);
2769         else
2770                 return ip6_route_del(&cfg);
2771 }
2772
2773 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
2774 {
2775         struct fib6_config cfg;
2776         int err;
2777
2778         err = rtm_to_fib6_config(skb, nlh, &cfg);
2779         if (err < 0)
2780                 return err;
2781
2782         if (cfg.fc_mp)
2783                 return ip6_route_multipath(&cfg, 1);
2784         else
2785                 return ip6_route_add(&cfg);
2786 }
2787
2788 static inline size_t rt6_nlmsg_size(void)
2789 {
2790         return NLMSG_ALIGN(sizeof(struct rtmsg))
2791                + nla_total_size(16) /* RTA_SRC */
2792                + nla_total_size(16) /* RTA_DST */
2793                + nla_total_size(16) /* RTA_GATEWAY */
2794                + nla_total_size(16) /* RTA_PREFSRC */
2795                + nla_total_size(4) /* RTA_TABLE */
2796                + nla_total_size(4) /* RTA_IIF */
2797                + nla_total_size(4) /* RTA_OIF */
2798                + nla_total_size(4) /* RTA_PRIORITY */
2799                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2800                + nla_total_size(sizeof(struct rta_cacheinfo))
2801                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
2802                + nla_total_size(1); /* RTA_PREF */
2803 }
2804
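/*
 * Fill one routing netlink message for @rt.  With @prefix set, routes
 * without RTF_PREFIX_RT are skipped (returning 1); returns 0 on success
 * or -EMSGSIZE if the skb ran out of room.
 */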
2805 static int rt6_fill_node(struct net *net,
2806                          struct sk_buff *skb, struct rt6_info *rt,
2807                          struct in6_addr *dst, struct in6_addr *src,
2808                          int iif, int type, u32 portid, u32 seq,
2809                          int prefix, int nowait, unsigned int flags)
2810 {
2811         u32 metrics[RTAX_MAX];
2812         struct rtmsg *rtm;
2813         struct nlmsghdr *nlh;
2814         long expires;
2815         u32 table;
2816
2817         if (prefix) {   /* user wants prefix routes only */
2818                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2819                         /* success since this is not a prefix route */
2820                         return 1;
2821                 }
2822         }
2823
2824         nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
2825         if (!nlh)
2826                 return -EMSGSIZE;
2827
2828         rtm = nlmsg_data(nlh);
2829         rtm->rtm_family = AF_INET6;
2830         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2831         rtm->rtm_src_len = rt->rt6i_src.plen;
2832         rtm->rtm_tos = 0;
2833         if (rt->rt6i_table)
2834                 table = rt->rt6i_table->tb6_id;
2835         else
2836                 table = RT6_TABLE_UNSPEC;
2837         rtm->rtm_table = table;
2838         if (nla_put_u32(skb, RTA_TABLE, table))
2839                 goto nla_put_failure;
2840         if (rt->rt6i_flags & RTF_REJECT) {
2841                 switch (rt->dst.error) {
2842                 case -EINVAL:
2843                         rtm->rtm_type = RTN_BLACKHOLE;
2844                         break;
2845                 case -EACCES:
2846                         rtm->rtm_type = RTN_PROHIBIT;
2847                         break;
2848                 case -EAGAIN:
2849                         rtm->rtm_type = RTN_THROW;
2850                         break;
2851                 default:
2852                         rtm->rtm_type = RTN_UNREACHABLE;
2853                         break;
2854                 }
2855         }
2856         else if (rt->rt6i_flags & RTF_LOCAL)
2857                 rtm->rtm_type = RTN_LOCAL;
2858         else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
2859                 rtm->rtm_type = RTN_LOCAL;
2860         else
2861                 rtm->rtm_type = RTN_UNICAST;
2862         rtm->rtm_flags = 0;
2863         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2864         rtm->rtm_protocol = rt->rt6i_protocol;
2865         if (rt->rt6i_flags & RTF_DYNAMIC)
2866                 rtm->rtm_protocol = RTPROT_REDIRECT;
2867         else if (rt->rt6i_flags & RTF_ADDRCONF) {
2868                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ROUTEINFO))
2869                         rtm->rtm_protocol = RTPROT_RA;
2870                 else
2871                         rtm->rtm_protocol = RTPROT_KERNEL;
2872         }
2873
2874         if (rt->rt6i_flags & RTF_CACHE)
2875                 rtm->rtm_flags |= RTM_F_CLONED;
2876
2877         if (dst) {
2878                 if (nla_put_in6_addr(skb, RTA_DST, dst))
2879                         goto nla_put_failure;
2880                 rtm->rtm_dst_len = 128;
2881         } else if (rtm->rtm_dst_len)
2882                 if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
2883                         goto nla_put_failure;
2884 #ifdef CONFIG_IPV6_SUBTREES
2885         if (src) {
2886                 if (nla_put_in6_addr(skb, RTA_SRC, src))
2887                         goto nla_put_failure;
2888                 rtm->rtm_src_len = 128;
2889         } else if (rtm->rtm_src_len &&
2890                    nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
2891                 goto nla_put_failure;
2892 #endif
2893         if (iif) {
2894 #ifdef CONFIG_IPV6_MROUTE
2895                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2896                         int err = ip6mr_get_route(net, skb, rtm, nowait);
2897                         if (err <= 0) {
2898                                 if (!nowait) {
2899                                         if (err == 0)
2900                                                 return 0;
2901                                         goto nla_put_failure;
2902                                 } else {
2903                                         if (err == -EMSGSIZE)
2904                                                 goto nla_put_failure;
2905                                 }
2906                         }
2907                 } else
2908 #endif
2909                         if (nla_put_u32(skb, RTA_IIF, iif))
2910                                 goto nla_put_failure;
2911         } else if (dst) {
2912                 struct in6_addr saddr_buf;
2913                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
2914                     nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
2915                         goto nla_put_failure;
2916         }
2917
2918         if (rt->rt6i_prefsrc.plen) {
2919                 struct in6_addr saddr_buf;
2920                 saddr_buf = rt->rt6i_prefsrc.addr;
2921                 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
2922                         goto nla_put_failure;
2923         }
2924
2925         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2926         if (rt->rt6i_pmtu)
2927                 metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
2928         if (rtnetlink_put_metrics(skb, metrics) < 0)
2929                 goto nla_put_failure;
2930
2931         if (rt->rt6i_flags & RTF_GATEWAY) {
2932                 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
2933                         goto nla_put_failure;
2934         }
2935
2936         if (rt->dst.dev &&
2937             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2938                 goto nla_put_failure;
2939         if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
2940                 goto nla_put_failure;
2941
2942         expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
2943
2944         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
2945                 goto nla_put_failure;
2946
2947         if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
2948                 goto nla_put_failure;
2949
2950         nlmsg_end(skb, nlh);
2951         return 0;
2952
2953 nla_put_failure:
2954         nlmsg_cancel(skb, nlh);
2955         return -EMSGSIZE;
2956 }
2957
2958 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2959 {
2960         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2961         int prefix;
2962
2963         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2964                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2965                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2966         } else
2967                 prefix = 0;
2968
2969         return rt6_fill_node(arg->net,
2970                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2971                      NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
2972                      prefix, 0, NLM_F_MULTI);
2973 }
2974
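/*
 * Handle an RTM_GETROUTE request: build a flow from the supplied
 * attributes, perform an input or output route lookup depending on
 * whether RTA_IIF was given, and unicast the resulting route back to
 * the requester.
 */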
2975 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2976 {
2977         struct net *net = sock_net(in_skb->sk);
2978         struct nlattr *tb[RTA_MAX+1];
2979         struct rt6_info *rt;
2980         struct sk_buff *skb;
2981         struct rtmsg *rtm;
2982         struct flowi6 fl6;
2983         int err, iif = 0, oif = 0;
2984
2985         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2986         if (err < 0)
2987                 goto errout;
2988
2989         err = -EINVAL;
2990         memset(&fl6, 0, sizeof(fl6));
2991
2992         if (tb[RTA_SRC]) {
2993                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2994                         goto errout;
2995
2996                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
2997         }
2998
2999         if (tb[RTA_DST]) {
3000                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
3001                         goto errout;
3002
3003                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
3004         }
3005
3006         if (tb[RTA_IIF])
3007                 iif = nla_get_u32(tb[RTA_IIF]);
3008
3009         if (tb[RTA_OIF])
3010                 oif = nla_get_u32(tb[RTA_OIF]);
3011
3012         if (tb[RTA_MARK])
3013                 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
3014
3015         if (iif) {
3016                 struct net_device *dev;
3017                 int flags = 0;
3018
3019                 dev = __dev_get_by_index(net, iif);
3020                 if (!dev) {
3021                         err = -ENODEV;
3022                         goto errout;
3023                 }
3024
3025                 fl6.flowi6_iif = iif;
3026
3027                 if (!ipv6_addr_any(&fl6.saddr))
3028                         flags |= RT6_LOOKUP_F_HAS_SADDR;
3029
3030                 rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
3031                                                                flags);
3032         } else {
3033                 fl6.flowi6_oif = oif;
3034
3035                 rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
3036         }
3037
3038         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3039         if (!skb) {
3040                 ip6_rt_put(rt);
3041                 err = -ENOBUFS;
3042                 goto errout;
3043         }
3044
3045         /* Reserve room for dummy headers; this skb can pass
3046          * through a good chunk of the routing engine.
3047          */
3048         skb_reset_mac_header(skb);
3049         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
3050
3051         skb_dst_set(skb, &rt->dst);
3052
3053         err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
3054                             RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
3055                             nlh->nlmsg_seq, 0, 0, 0);
3056         if (err < 0) {
3057                 kfree_skb(skb);
3058                 goto errout;
3059         }
3060
3061         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3062 errout:
3063         return err;
3064 }
3065
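/*
 * Notify rtnetlink listeners (RTNLGRP_IPV6_ROUTE) about a route change.
 * The skb is sized by rt6_nlmsg_size(), so -EMSGSIZE from rt6_fill_node()
 * indicates a sizing bug rather than a transient failure.
 */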
3066 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
3067 {
3068         struct sk_buff *skb;
3069         struct net *net = info->nl_net;
3070         u32 seq;
3071         int err;
3072
3073         err = -ENOBUFS;
3074         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3075
3076         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
3077         if (!skb)
3078                 goto errout;
3079
3080         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
3081                                 event, info->portid, seq, 0, 0, 0);
3082         if (err < 0) {
3083                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
3084                 WARN_ON(err == -EMSGSIZE);
3085                 kfree_skb(skb);
3086                 goto errout;
3087         }
3088         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3089                     info->nlh, gfp_any());
3090         return;
3091 errout:
3092         if (err < 0)
3093                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
3094 }
3095
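/*
 * Netdevice notifier: once the loopback device registers, attach the
 * per-namespace null entry (and, with CONFIG_IPV6_MULTIPLE_TABLES, the
 * prohibit and blackhole entries) to it.
 */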
3096 static int ip6_route_dev_notify(struct notifier_block *this,
3097                                 unsigned long event, void *ptr)
3098 {
3099         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
3100         struct net *net = dev_net(dev);
3101
3102         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
3103                 net->ipv6.ip6_null_entry->dst.dev = dev;
3104                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
3105 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3106                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
3107                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
3108                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
3109                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
3110 #endif
3111         }
3112
3113         return NOTIFY_OK;
3114 }
3115
3116 /*
3117  *      /proc
3118  */
3119
3120 #ifdef CONFIG_PROC_FS
3121
3122 static const struct file_operations ipv6_route_proc_fops = {
3123         .owner          = THIS_MODULE,
3124         .open           = ipv6_route_open,
3125         .read           = seq_read,
3126         .llseek         = seq_lseek,
3127         .release        = seq_release_net,
3128 };
3129
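/*
 * /proc/net/rt6_stats: seven hex fields - FIB nodes, route nodes,
 * route allocations, route entries, cached routes, current dst entries
 * and discarded routes.
 */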
3130 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
3131 {
3132         struct net *net = (struct net *)seq->private;
3133         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
3134                    net->ipv6.rt6_stats->fib_nodes,
3135                    net->ipv6.rt6_stats->fib_route_nodes,
3136                    net->ipv6.rt6_stats->fib_rt_alloc,
3137                    net->ipv6.rt6_stats->fib_rt_entries,
3138                    net->ipv6.rt6_stats->fib_rt_cache,
3139                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
3140                    net->ipv6.rt6_stats->fib_discarded_routes);
3141
3142         return 0;
3143 }
3144
3145 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
3146 {
3147         return single_open_net(inode, file, rt6_stats_seq_show);
3148 }
3149
3150 static const struct file_operations rt6_stats_seq_fops = {
3151         .owner   = THIS_MODULE,
3152         .open    = rt6_stats_seq_open,
3153         .read    = seq_read,
3154         .llseek  = seq_lseek,
3155         .release = single_release_net,
3156 };
3157 #endif  /* CONFIG_PROC_FS */
3158
3159 #ifdef CONFIG_SYSCTL
3160
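/*
 * Write-only handler for the "flush" sysctl: a write stores the new
 * value via proc_dointvec() and triggers fib6_run_gc() with the delay
 * sampled before the update; reads return -EINVAL.
 */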
3161 static
3162 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
3163                               void __user *buffer, size_t *lenp, loff_t *ppos)
3164 {
3165         struct net *net;
3166         int delay;
3167         if (!write)
3168                 return -EINVAL;
3169
3170         net = (struct net *)ctl->extra1;
3171         delay = net->ipv6.sysctl.flush_delay;
3172         proc_dointvec(ctl, write, buffer, lenp, ppos);
3173         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
3174         return 0;
3175 }
3176
3177 struct ctl_table ipv6_route_table_template[] = {
3178         {
3179                 .procname       =       "flush",
3180                 .data           =       &init_net.ipv6.sysctl.flush_delay,
3181                 .maxlen         =       sizeof(int),
3182                 .mode           =       0200,
3183                 .proc_handler   =       ipv6_sysctl_rtcache_flush
3184         },
3185         {
3186                 .procname       =       "gc_thresh",
3187                 .data           =       &ip6_dst_ops_template.gc_thresh,
3188                 .maxlen         =       sizeof(int),
3189                 .mode           =       0644,
3190                 .proc_handler   =       proc_dointvec,
3191         },
3192         {
3193                 .procname       =       "max_size",
3194                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
3195                 .maxlen         =       sizeof(int),
3196                 .mode           =       0644,
3197                 .proc_handler   =       proc_dointvec,
3198         },
3199         {
3200                 .procname       =       "gc_min_interval",
3201                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3202                 .maxlen         =       sizeof(int),
3203                 .mode           =       0644,
3204                 .proc_handler   =       proc_dointvec_jiffies,
3205         },
3206         {
3207                 .procname       =       "gc_timeout",
3208                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
3209                 .maxlen         =       sizeof(int),
3210                 .mode           =       0644,
3211                 .proc_handler   =       proc_dointvec_jiffies,
3212         },
3213         {
3214                 .procname       =       "gc_interval",
3215                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
3216                 .maxlen         =       sizeof(int),
3217                 .mode           =       0644,
3218                 .proc_handler   =       proc_dointvec_jiffies,
3219         },
3220         {
3221                 .procname       =       "gc_elasticity",
3222                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
3223                 .maxlen         =       sizeof(int),
3224                 .mode           =       0644,
3225                 .proc_handler   =       proc_dointvec,
3226         },
3227         {
3228                 .procname       =       "mtu_expires",
3229                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
3230                 .maxlen         =       sizeof(int),
3231                 .mode           =       0644,
3232                 .proc_handler   =       proc_dointvec_jiffies,
3233         },
3234         {
3235                 .procname       =       "min_adv_mss",
3236                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
3237                 .maxlen         =       sizeof(int),
3238                 .mode           =       0644,
3239                 .proc_handler   =       proc_dointvec,
3240         },
3241         {
3242                 .procname       =       "gc_min_interval_ms",
3243                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3244                 .maxlen         =       sizeof(int),
3245                 .mode           =       0644,
3246                 .proc_handler   =       proc_dointvec_ms_jiffies,
3247         },
3248         { }
3249 };
3250
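/*
 * Clone the sysctl template for a namespace and repoint each entry's
 * .data at that namespace's fields; the sysctls are hidden from
 * namespaces owned by a non-init user namespace.
 */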
3251 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
3252 {
3253         struct ctl_table *table;
3254
3255         table = kmemdup(ipv6_route_table_template,
3256                         sizeof(ipv6_route_table_template),
3257                         GFP_KERNEL);
3258
3259         if (table) {
3260                 table[0].data = &net->ipv6.sysctl.flush_delay;
3261                 table[0].extra1 = net;
3262                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
3263                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
3264                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3265                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
3266                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
3267                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
3268                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
3269                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
3270                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3271
3272                 /* Don't export sysctls to unprivileged users */
3273                 if (net->user_ns != &init_user_ns)
3274                         table[0].procname = NULL;
3275         }
3276
3277         return table;
3278 }
3279 #endif
3280
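/*
 * Per-namespace init: copy the dst_ops template, allocate this
 * namespace's null (and, with CONFIG_IPV6_MULTIPLE_TABLES, prohibit and
 * blackhole) route entries from their templates, and seed the default
 * GC and PMTU sysctl values.
 */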
3281 static int __net_init ip6_route_net_init(struct net *net)
3282 {
3283         int ret = -ENOMEM;
3284
3285         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
3286                sizeof(net->ipv6.ip6_dst_ops));
3287
3288         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
3289                 goto out_ip6_dst_ops;
3290
3291         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
3292                                            sizeof(*net->ipv6.ip6_null_entry),
3293                                            GFP_KERNEL);
3294         if (!net->ipv6.ip6_null_entry)
3295                 goto out_ip6_dst_entries;
3296         net->ipv6.ip6_null_entry->dst.path =
3297                 (struct dst_entry *)net->ipv6.ip6_null_entry;
3298         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3299         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
3300                          ip6_template_metrics, true);
3301
3302 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3303         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
3304                                                sizeof(*net->ipv6.ip6_prohibit_entry),
3305                                                GFP_KERNEL);
3306         if (!net->ipv6.ip6_prohibit_entry)
3307                 goto out_ip6_null_entry;
3308         net->ipv6.ip6_prohibit_entry->dst.path =
3309                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
3310         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3311         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
3312                          ip6_template_metrics, true);
3313
3314         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
3315                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
3316                                                GFP_KERNEL);
3317         if (!net->ipv6.ip6_blk_hole_entry)
3318                 goto out_ip6_prohibit_entry;
3319         net->ipv6.ip6_blk_hole_entry->dst.path =
3320                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
3321         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3322         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
3323                          ip6_template_metrics, true);
3324 #endif
3325
3326         net->ipv6.sysctl.flush_delay = 0;
3327         net->ipv6.sysctl.ip6_rt_max_size = 4096;
3328         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
3329         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
3330         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
3331         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
3332         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
3333         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
3334
3335         net->ipv6.ip6_rt_gc_expire = 30*HZ;
3336
3337         ret = 0;
3338 out:
3339         return ret;
3340
3341 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3342 out_ip6_prohibit_entry:
3343         kfree(net->ipv6.ip6_prohibit_entry);
3344 out_ip6_null_entry:
3345         kfree(net->ipv6.ip6_null_entry);
3346 #endif
3347 out_ip6_dst_entries:
3348         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
3349 out_ip6_dst_ops:
3350         goto out;
3351 }
3352
3353 static void __net_exit ip6_route_net_exit(struct net *net)
3354 {
3355         kfree(net->ipv6.ip6_null_entry);
3356 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3357         kfree(net->ipv6.ip6_prohibit_entry);
3358         kfree(net->ipv6.ip6_blk_hole_entry);
3359 #endif
3360         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
3361 }
3362
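/*
 * Late per-namespace init/exit: create and remove the /proc/net
 * "ipv6_route" and "rt6_stats" entries.
 */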
3363 static int __net_init ip6_route_net_init_late(struct net *net)
3364 {
3365 #ifdef CONFIG_PROC_FS
3366         proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
3367         proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
3368 #endif
3369         return 0;
3370 }
3371
3372 static void __net_exit ip6_route_net_exit_late(struct net *net)
3373 {
3374 #ifdef CONFIG_PROC_FS
3375         remove_proc_entry("ipv6_route", net->proc_net);
3376         remove_proc_entry("rt6_stats", net->proc_net);
3377 #endif
3378 }
3379
3380 static struct pernet_operations ip6_route_net_ops = {
3381         .init = ip6_route_net_init,
3382         .exit = ip6_route_net_exit,
3383 };
3384
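/*
 * Per-namespace inet_peer base used for IPv6 peer state: allocated on
 * namespace creation and invalidated/freed on teardown.
 */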
3385 static int __net_init ipv6_inetpeer_init(struct net *net)
3386 {
3387         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3388
3389         if (!bp)
3390                 return -ENOMEM;
3391         inet_peer_base_init(bp);
3392         net->ipv6.peers = bp;
3393         return 0;
3394 }
3395
3396 static void __net_exit ipv6_inetpeer_exit(struct net *net)
3397 {
3398         struct inet_peer_base *bp = net->ipv6.peers;
3399
3400         net->ipv6.peers = NULL;
3401         inetpeer_invalidate_tree(bp);
3402         kfree(bp);
3403 }
3404
3405 static struct pernet_operations ipv6_inetpeer_ops = {
3406         .init   =       ipv6_inetpeer_init,
3407         .exit   =       ipv6_inetpeer_exit,
3408 };
3409
3410 static struct pernet_operations ip6_route_net_late_ops = {
3411         .init = ip6_route_net_init_late,
3412         .exit = ip6_route_net_exit_late,
3413 };
3414
3415 static struct notifier_block ip6_route_dev_notifier = {
3416         .notifier_call = ip6_route_dev_notify,
3417         .priority = 0,
3418 };
3419
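/*
 * Subsystem init: create the ip6_dst_cache slab, register the pernet
 * operations, hook init_net's special route entries to the already
 * registered loopback device, bring up fib6, xfrm6 and fib6 rules,
 * register the rtnetlink route handlers and the netdevice notifier,
 * and initialise the per-CPU uncached route lists.  Errors unwind in
 * reverse order through the labels below.
 */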
3420 int __init ip6_route_init(void)
3421 {
3422         int ret;
3423         int cpu;
3424
3425         ret = -ENOMEM;
3426         ip6_dst_ops_template.kmem_cachep =
3427                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
3428                                   SLAB_HWCACHE_ALIGN, NULL);
3429         if (!ip6_dst_ops_template.kmem_cachep)
3430                 goto out;
3431
3432         ret = dst_entries_init(&ip6_dst_blackhole_ops);
3433         if (ret)
3434                 goto out_kmem_cache;
3435
3436         ret = register_pernet_subsys(&ipv6_inetpeer_ops);
3437         if (ret)
3438                 goto out_dst_entries;
3439
3440         ret = register_pernet_subsys(&ip6_route_net_ops);
3441         if (ret)
3442                 goto out_register_inetpeer;
3443
3444         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
3445
3446         /* The loopback device is registered before this code runs, so the
3447          * loopback reference in rt6_info is not taken automatically; take it
3448          * manually for init_net. */
3449         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
3450         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3451 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3452         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
3453         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3454         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
3455         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3456 #endif
3457         ret = fib6_init();
3458         if (ret)
3459                 goto out_register_subsys;
3460
3461         ret = xfrm6_init();
3462         if (ret)
3463                 goto out_fib6_init;
3464
3465         ret = fib6_rules_init();
3466         if (ret)
3467                 goto xfrm6_init;
3468
3469         ret = register_pernet_subsys(&ip6_route_net_late_ops);
3470         if (ret)
3471                 goto fib6_rules_init;
3472
3473         ret = -ENOBUFS;
3474         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
3475             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
3476             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
3477                 goto out_register_late_subsys;
3478
3479         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3480         if (ret)
3481                 goto out_register_late_subsys;
3482
3483         for_each_possible_cpu(cpu) {
3484                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
3485
3486                 INIT_LIST_HEAD(&ul->head);
3487                 spin_lock_init(&ul->lock);
3488         }
3489
3490 out:
3491         return ret;
3492
3493 out_register_late_subsys:
3494         unregister_pernet_subsys(&ip6_route_net_late_ops);
3495 fib6_rules_init:
3496         fib6_rules_cleanup();
3497 xfrm6_init:
3498         xfrm6_fini();
3499 out_fib6_init:
3500         fib6_gc_cleanup();
3501 out_register_subsys:
3502         unregister_pernet_subsys(&ip6_route_net_ops);
3503 out_register_inetpeer:
3504         unregister_pernet_subsys(&ipv6_inetpeer_ops);
3505 out_dst_entries:
3506         dst_entries_destroy(&ip6_dst_blackhole_ops);
3507 out_kmem_cache:
3508         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3509         goto out;
3510 }
3511
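/*
 * Module teardown: undo ip6_route_init() in reverse order.
 */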
3512 void ip6_route_cleanup(void)
3513 {
3514         unregister_netdevice_notifier(&ip6_route_dev_notifier);
3515         unregister_pernet_subsys(&ip6_route_net_late_ops);
3516         fib6_rules_cleanup();
3517         xfrm6_fini();
3518         fib6_gc_cleanup();
3519         unregister_pernet_subsys(&ipv6_inetpeer_ops);
3520         unregister_pernet_subsys(&ip6_route_net_ops);
3521         dst_entries_destroy(&ip6_dst_blackhole_ops);
3522         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3523 }