OSDN Git Service

0a99cda9fd7b915725c79e37c173ed900f3a14d3
[uclinux-h8/linux.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <trace/events/fib6.h>
67
68 #include <linux/uaccess.h>
69
70 #ifdef CONFIG_SYSCTL
71 #include <linux/sysctl.h>
72 #endif
73
/*
 * Outcome of the neighbour reachability check done for a route
 * (see rt6_check_neigh()).  Every failure value is negative, so
 * rt6_score_route() can detect a failure with a plain "< 0" test;
 * find_match() then distinguishes FAIL_HARD (skip the route) from
 * FAIL_DO_RR (keep it with the lowest score and request round-robin).
 */
74 enum rt6_nud_state {
75         RT6_NUD_FAIL_HARD = -3,
76         RT6_NUD_FAIL_PROBE = -2,
77         RT6_NUD_FAIL_DO_RR = -1,
78         RT6_NUD_SUCCEED = 1
79 };
80
/*
 * Forward declarations.  Several of these are referenced by the dst_ops
 * tables and the route templates below before their definitions appear.
 */
81 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
82 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
83 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
84 static unsigned int      ip6_mtu(const struct dst_entry *dst);
85 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
86 static void             ip6_dst_destroy(struct dst_entry *);
87 static void             ip6_dst_ifdown(struct dst_entry *,
88                                        struct net_device *dev, int how);
89 static int               ip6_dst_gc(struct dst_ops *ops);
90
91 static int              ip6_pkt_discard(struct sk_buff *skb);
92 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
93 static int              ip6_pkt_prohibit(struct sk_buff *skb);
94 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
95 static void             ip6_link_failure(struct sk_buff *skb);
96 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
97                                            struct sk_buff *skb, u32 mtu);
98 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
99                                         struct sk_buff *skb);
100 static void             rt6_dst_from_metrics_check(struct rt6_info *rt);
101 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
102 static size_t rt6_nlmsg_size(struct rt6_info *rt);
103 static int rt6_fill_node(struct net *net,
104                          struct sk_buff *skb, struct rt6_info *rt,
105                          struct in6_addr *dst, struct in6_addr *src,
106                          int iif, int type, u32 portid, u32 seq,
107                          unsigned int flags);
108 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
109                                            struct in6_addr *daddr,
110                                            struct in6_addr *saddr);
111
/* Route Information option helpers; used by rt6_route_rcv(). */
112 #ifdef CONFIG_IPV6_ROUTE_INFO
113 static struct rt6_info *rt6_add_route_info(struct net *net,
114                                            const struct in6_addr *prefix, int prefixlen,
115                                            const struct in6_addr *gwaddr,
116                                            struct net_device *dev,
117                                            unsigned int pref);
118 static struct rt6_info *rt6_get_route_info(struct net *net,
119                                            const struct in6_addr *prefix, int prefixlen,
120                                            const struct in6_addr *gwaddr,
121                                            struct net_device *dev);
122 #endif
123
/*
 * A list of rt6_info entries (linked through rt6i_uncached) together
 * with the spinlock that guards it.  One instance exists per CPU.
 */
124 struct uncached_list {
125         spinlock_t              lock;
126         struct list_head        head;
127 };
128
129 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
130
/*
 * Append @rt to the uncached list of the CPU we are running on.
 *
 * The chosen list is recorded in rt->rt6i_uncached_list so that
 * rt6_uncached_list_del() can later lock the same list, whichever CPU
 * it runs on.  This function does not touch the fib_rt_uncache counter
 * that rt6_uncached_list_del() decrements.
 * NOTE(review): the matching increment is presumably done by callers -
 * confirm.
 */
131 void rt6_uncached_list_add(struct rt6_info *rt)
132 {
133         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
134
135         rt->rt6i_uncached_list = ul;
136
137         spin_lock_bh(&ul->lock);
138         list_add_tail(&rt->rt6i_uncached, &ul->head);
139         spin_unlock_bh(&ul->lock);
140 }
141
/*
 * Unlink @rt from the uncached list it was added to, if any.
 *
 * An empty rt6i_uncached list head means @rt is not on any list and
 * nothing is done.  Otherwise the entry is removed under the lock of
 * the list recorded in rt->rt6i_uncached_list, and the namespace's
 * fib_rt_uncache counter is decremented.
 */
142 void rt6_uncached_list_del(struct rt6_info *rt)
143 {
144         if (!list_empty(&rt->rt6i_uncached)) {
145                 struct uncached_list *ul = rt->rt6i_uncached_list;
146                 struct net *net = dev_net(rt->dst.dev);
147
148                 spin_lock_bh(&ul->lock);
149                 list_del(&rt->rt6i_uncached);
150                 atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
151                 spin_unlock_bh(&ul->lock);
152         }
153 }
154
/*
 * Re-point every uncached route that references @dev at the loopback
 * device of @net.
 *
 * All possible CPUs' lists are walked, each under its own lock.  For a
 * matching entry the inet6_dev and/or net_device reference is swapped:
 * a reference on the loopback object is taken and the reference on the
 * old object is dropped.  Nothing is done when @dev is the loopback
 * device itself.
 */
155 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
156 {
157         struct net_device *loopback_dev = net->loopback_dev;
158         int cpu;
159
160         if (dev == loopback_dev)
161                 return;
162
163         for_each_possible_cpu(cpu) {
164                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
165                 struct rt6_info *rt;
166
167                 spin_lock_bh(&ul->lock);
168                 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
169                         struct inet6_dev *rt_idev = rt->rt6i_idev;
170                         struct net_device *rt_dev = rt->dst.dev;
171
                            /* inet6_dev belongs to @dev: move to loopback's */
172                         if (rt_idev->dev == dev) {
173                                 rt->rt6i_idev = in6_dev_get(loopback_dev);
174                                 in6_dev_put(rt_idev);
175                         }
176
                            /* dst device is @dev: move to loopback_dev */
177                         if (rt_dev == dev) {
178                                 rt->dst.dev = loopback_dev;
179                                 dev_hold(rt->dst.dev);
180                                 dev_put(rt_dev);
181                         }
182                 }
183                 spin_unlock_bh(&ul->lock);
184         }
185 }
186
/*
 * Used by ipv6_cow_metrics() for RTF_PCPU routes: return the writable
 * metrics pointer of rt->from rather than of @rt itself.
 */
187 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
188 {
189         return dst_metrics_write_ptr(&rt->from->dst);
190 }
191
/*
 * dst_ops->cow_metrics callback for IPv6 routes.
 *
 * RTF_PCPU routes hand back the writable metrics of the route they
 * came from; RTF_CACHE routes have no writable metrics (NULL); every
 * other route falls through to the generic copy-on-write helper.
 * RTF_PCPU is tested first, so it wins when both flags are set.
 */
192 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
193 {
194         struct rt6_info *route = (struct rt6_info *)dst;
195
196         if (route->rt6i_flags & RTF_PCPU)
197                 return rt6_pcpu_cow_metrics(route);
198
199         if (route->rt6i_flags & RTF_CACHE)
200                 return NULL;
201         return dst_cow_metrics_generic(dst, old);
202 }
203
/*
 * Pick the address to use for a neighbour lookup on @rt.
 *
 * Preference order: the route's gateway when it is not the unspecified
 * address, else the destination address in @skb's IPv6 header when an
 * skb is supplied, else the caller-provided @daddr (which may be NULL).
 */
204 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
205                                              struct sk_buff *skb,
206                                              const void *daddr)
207 {
208         const struct in6_addr *gw = &rt->rt6i_gateway;
209
210         if (!ipv6_addr_any(gw))
211                 return gw;
212         if (!skb)
213                 return daddr;
214         return &ipv6_hdr(skb)->daddr;
215 }
216
/*
 * dst_ops->neigh_lookup callback.
 *
 * Resolves the lookup address with choose_neigh_daddr(), looks it up
 * on dst->dev and, when no entry exists, creates one in nd_tbl.
 * Returns whatever __ipv6_neigh_lookup() or neigh_create() returned.
 */
217 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
218                                           struct sk_buff *skb,
219                                           const void *daddr)
220 {
221         struct rt6_info *rt = (struct rt6_info *) dst;
222         struct neighbour *n;
223
224         daddr = choose_neigh_daddr(rt, skb, daddr);
225         n = __ipv6_neigh_lookup(dst->dev, daddr);
226         if (n)
227                 return n;
228         return neigh_create(&nd_tbl, daddr, dst->dev);
229 }
230
/*
 * dst_ops->confirm_neigh callback.
 *
 * Confirms the neighbour for the address chosen by choose_neigh_daddr()
 * (no skb is available here, so only the gateway or @daddr can be
 * picked).  Silently returns when there is no address, when the device
 * has IFF_NOARP or IFF_LOOPBACK set, or when the address is multicast.
 */
231 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
232 {
233         struct net_device *dev = dst->dev;
234         struct rt6_info *rt = (struct rt6_info *)dst;
235
236         daddr = choose_neigh_daddr(rt, NULL, daddr);
237         if (!daddr)
238                 return;
239         if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
240                 return;
241         if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
242                 return;
243         __ipv6_confirm_neigh(dev, daddr);
244 }
245
/*
 * dst_ops callback table for regular IPv6 routes.
 * NOTE(review): presumably copied into each namespace's
 * net->ipv6.ip6_dst_ops (the table __ip6_dst_alloc() allocates from) -
 * confirm against the namespace init code.
 */
246 static struct dst_ops ip6_dst_ops_template = {
247         .family                 =       AF_INET6,
248         .gc                     =       ip6_dst_gc,
249         .gc_thresh              =       1024,
250         .check                  =       ip6_dst_check,
251         .default_advmss         =       ip6_default_advmss,
252         .mtu                    =       ip6_mtu,
253         .cow_metrics            =       ipv6_cow_metrics,
254         .destroy                =       ip6_dst_destroy,
255         .ifdown                 =       ip6_dst_ifdown,
256         .negative_advice        =       ip6_negative_advice,
257         .link_failure           =       ip6_link_failure,
258         .update_pmtu            =       ip6_rt_update_pmtu,
259         .redirect               =       rt6_do_redirect,
260         .local_out              =       __ip6_local_out,
261         .neigh_lookup           =       ip6_neigh_lookup,
262         .confirm_neigh          =       ip6_confirm_neigh,
263 };
264
/*
 * dst_ops->mtu callback for blackhole dsts: use the raw RTAX_MTU
 * metric when it is non-zero, otherwise the MTU of the dst's device.
 */
265 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
266 {
267         const unsigned int metric_mtu = dst_metric_raw(dst, RTAX_MTU);
268
269         return metric_mtu ? metric_mtu : dst->dev->mtu;
270 }
271
/* dst_ops->update_pmtu callback for blackhole dsts: a no-op. */
272 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
273                                          struct sk_buff *skb, u32 mtu)
274 {
275 }
276
/* dst_ops->redirect callback for blackhole dsts: a no-op. */
277 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
278                                       struct sk_buff *skb)
279 {
280 }
281
/*
 * dst_ops callback table for blackhole dsts.  It shares destroy, check,
 * default_advmss and neigh_lookup with ip6_dst_ops_template, but uses
 * no-op update_pmtu/redirect handlers, its own mtu callback and the
 * generic copy-on-write metrics helper.
 */
282 static struct dst_ops ip6_dst_blackhole_ops = {
283         .family                 =       AF_INET6,
284         .destroy                =       ip6_dst_destroy,
285         .check                  =       ip6_dst_check,
286         .mtu                    =       ip6_blackhole_mtu,
287         .default_advmss         =       ip6_default_advmss,
288         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
289         .redirect               =       ip6_rt_blackhole_redirect,
290         .cow_metrics            =       dst_cow_metrics_generic,
291         .neigh_lookup           =       ip6_neigh_lookup,
292 };
293
/*
 * Read-only metrics array in which every entry is zero; only the
 * hop-limit slot is spelled out explicitly.
 */
294 static const u32 ip6_template_metrics[RTAX_MAX] = {
295         [RTAX_HOPLIMIT - 1] = 0,
296 };
297
/*
 * Template for the "null" route entry: a reject route without next hop
 * (RTF_REJECT | RTF_NONEXTHOP) whose dst error is -ENETUNREACH and
 * whose input/output handlers are ip6_pkt_discard() and
 * ip6_pkt_discard_out().  It starts with one reference and the largest
 * possible metric.
 */
298 static const struct rt6_info ip6_null_entry_template = {
299         .dst = {
300                 .__refcnt       = ATOMIC_INIT(1),
301                 .__use          = 1,
302                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
303                 .error          = -ENETUNREACH,
304                 .input          = ip6_pkt_discard,
305                 .output         = ip6_pkt_discard_out,
306         },
307         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
308         .rt6i_protocol  = RTPROT_KERNEL,
309         .rt6i_metric    = ~(u32) 0,
310         .rt6i_ref       = ATOMIC_INIT(1),
311 };
312
/*
 * Two further reject-route templates, built only with
 * CONFIG_IPV6_MULTIPLE_TABLES.  They differ from the null entry
 * template only in the dst error code and the input/output handlers.
 */
313 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
314
/* "Prohibit" entry: error -EACCES, handled by ip6_pkt_prohibit{,_out}(). */
315 static const struct rt6_info ip6_prohibit_entry_template = {
316         .dst = {
317                 .__refcnt       = ATOMIC_INIT(1),
318                 .__use          = 1,
319                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
320                 .error          = -EACCES,
321                 .input          = ip6_pkt_prohibit,
322                 .output         = ip6_pkt_prohibit_out,
323         },
324         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
325         .rt6i_protocol  = RTPROT_KERNEL,
326         .rt6i_metric    = ~(u32) 0,
327         .rt6i_ref       = ATOMIC_INIT(1),
328 };
329
/* "Blackhole" entry: error -EINVAL, handled by dst_discard{,_out}(). */
330 static const struct rt6_info ip6_blk_hole_entry_template = {
331         .dst = {
332                 .__refcnt       = ATOMIC_INIT(1),
333                 .__use          = 1,
334                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
335                 .error          = -EINVAL,
336                 .input          = dst_discard,
337                 .output         = dst_discard_out,
338         },
339         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
340         .rt6i_protocol  = RTPROT_KERNEL,
341         .rt6i_metric    = ~(u32) 0,
342         .rt6i_ref       = ATOMIC_INIT(1),
343 };
344
345 #endif
346
/*
 * Initialise the rt6_info-specific part of a freshly allocated route.
 *
 * Everything that follows the embedded dst_entry is zeroed; the
 * dst_entry itself is left untouched.  The pointer arithmetic relies on
 * dst being the first member of struct rt6_info, as do the
 * (struct rt6_info *)dst casts elsewhere in this file.  The two list
 * heads are then set up as empty lists.
 */
347 static void rt6_info_init(struct rt6_info *rt)
348 {
349         struct dst_entry *dst = &rt->dst;
350
351         memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
352         INIT_LIST_HEAD(&rt->rt6i_siblings);
353         INIT_LIST_HEAD(&rt->rt6i_uncached);
354 }
355
356 /* allocate dst with ip6_dst_ops */
/*
 * The dst comes from the namespace's ip6_dst_ops, with the initial
 * reference argument set to 1 and obsolete set to
 * DST_OBSOLETE_FORCE_CHK.  On success the rt6_info part is initialised
 * and the namespace's fib_rt_alloc counter is bumped.  Returns NULL if
 * dst_alloc() fails.
 */
357 static struct rt6_info *__ip6_dst_alloc(struct net *net,
358                                         struct net_device *dev,
359                                         int flags)
360 {
361         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
362                                         1, DST_OBSOLETE_FORCE_CHK, flags);
363
364         if (rt) {
365                 rt6_info_init(rt);
366                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
367         }
368
369         return rt;
370 }
371
/*
 * Allocate a route with __ip6_dst_alloc() and attach a per-cpu array of
 * rt6_info pointers to it (rt6i_pcpu, allocated with GFP_ATOMIC).
 *
 * Returns the new route, or NULL if either allocation fails.  When only
 * the per-cpu allocation fails, the already-allocated dst is released
 * immediately before returning.
 */
372 struct rt6_info *ip6_dst_alloc(struct net *net,
373                                struct net_device *dev,
374                                int flags)
375 {
376         struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
377
378         if (rt) {
379                 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
380                 if (!rt->rt6i_pcpu) {
381                         dst_release_immediate(&rt->dst);
382                         return NULL;
383                 }
384         }
385
386         return rt;
387 }
388 EXPORT_SYMBOL(ip6_dst_alloc);
389
/*
 * dst_ops->destroy callback: release everything an rt6_info owns.
 *
 * In order: the metrics, the per-cpu pointer array, membership of the
 * uncached list, the inet6_dev reference, the exception bucket and
 * finally the reference on the route this one was derived from.
 */
390 static void ip6_dst_destroy(struct dst_entry *dst)
391 {
392         struct rt6_info *rt = (struct rt6_info *)dst;
393         struct rt6_exception_bucket *bucket;
394         struct rt6_info *from = rt->from;
395         struct inet6_dev *idev;
396
397         dst_destroy_metrics_generic(dst);
398         free_percpu(rt->rt6i_pcpu);
399         rt6_uncached_list_del(rt);
400
401         idev = rt->rt6i_idev;
402         if (idev) {
403                 rt->rt6i_idev = NULL;
404                 in6_dev_put(idev);
405         }
            /* the constant 1 asserts that no RCU/lock check is needed here */
406         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1);
407         if (bucket) {
408                 rt->rt6i_exception_bucket = NULL;
409                 kfree(bucket);
410         }
411
            /*
             * NOTE(review): from is not checked for NULL; this looks like
             * it relies on dst being the first member (so &from->dst is
             * NULL too) and on dst_release() accepting NULL - confirm.
             */
412         rt->from = NULL;
413         dst_release(&from->dst);
414 }
415
/*
 * dst_ops->ifdown callback: move the route's inet6_dev reference to the
 * loopback device of @dev's namespace.
 *
 * Nothing happens when the route has no inet6_dev or already points at
 * loopback.  If the loopback device has no inet6_dev (in6_dev_get()
 * returns NULL), the route keeps its current one.  @how is not used.
 */
416 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
417                            int how)
418 {
419         struct rt6_info *rt = (struct rt6_info *)dst;
420         struct inet6_dev *idev = rt->rt6i_idev;
421         struct net_device *loopback_dev =
422                 dev_net(dev)->loopback_dev;
423
424         if (idev && idev->dev != loopback_dev) {
425                 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
426                 if (loopback_idev) {
427                         rt->rt6i_idev = loopback_idev;
428                         in6_dev_put(idev);
429                 }
430         }
431 }
432
/*
 * Report whether @rt's own lifetime has run out.  Only routes flagged
 * RTF_EXPIRES have a deadline; unlike rt6_check_expired(), the route
 * this one was derived from is not consulted.
 */
433 static bool __rt6_check_expired(const struct rt6_info *rt)
434 {
435         if (!(rt->rt6i_flags & RTF_EXPIRES))
436                 return false;
437
438         return time_after(jiffies, rt->dst.expires);
439 }
440
/*
 * Report whether @rt should be treated as expired.
 *
 * A route with RTF_EXPIRES is judged only by its own deadline.  A route
 * without that flag but with a parent (rt->from) is expired when its
 * dst is no longer marked DST_OBSOLETE_FORCE_CHK or when the parent is
 * itself expired (checked recursively).  Anything else never expires.
 */
441 static bool rt6_check_expired(const struct rt6_info *rt)
442 {
443         if (rt->rt6i_flags & RTF_EXPIRES) {
444                 if (time_after(jiffies, rt->dst.expires))
445                         return true;
446         } else if (rt->from) {
447                 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
448                         rt6_check_expired(rt->from);
449         }
450         return false;
451 }
452
/*
 * Choose one route among @match and its siblings using the flow hash.
 *
 * Each route carries an upper bound (rt6i_nh_upper_bound); the first
 * route, starting with @match itself, whose bound is not below the hash
 * is the candidate.  A sibling candidate is only accepted if
 * rt6_score_route() does not return a negative value for it; otherwise
 * the walk stops and @match is returned unchanged.
 *
 * Side effect: fills in fl6->mp_hash when it is still zero.
 */
453 static struct rt6_info *rt6_multipath_select(const struct net *net,
454                                              struct rt6_info *match,
455                                              struct flowi6 *fl6, int oif,
456                                              const struct sk_buff *skb,
457                                              int strict)
458 {
459         struct rt6_info *sibling, *next_sibling;
460
461         /* We might have already computed the hash for ICMPv6 errors. In such
462          * case it will always be non-zero. Otherwise now is the time to do it.
463          */
464         if (!fl6->mp_hash)
465                 fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
466
467         if (fl6->mp_hash <= atomic_read(&match->rt6i_nh_upper_bound))
468                 return match;
469
470         list_for_each_entry_safe(sibling, next_sibling, &match->rt6i_siblings,
471                                  rt6i_siblings) {
472                 if (fl6->mp_hash > atomic_read(&sibling->rt6i_nh_upper_bound))
473                         continue;
                    /* hash falls in this sibling's range: take it or give up */
474                 if (rt6_score_route(sibling, oif, strict) < 0)
475                         break;
476                 match = sibling;
477                 break;
478         }
479
480         return match;
481 }
482
483 /*
484  *      Route lookup. rcu_read_lock() should be held.
485  */
486
/*
 * Starting at @rt, walk the rt6_next chain and pick the route that fits
 * the requested output interface @oif or, when @oif is 0, the source
 * address @saddr.  Routes whose next hop is flagged RTNH_F_DEAD are
 * skipped.
 *
 * - No @oif, unspecified @saddr and a live @rt: @rt is returned as is.
 * - With @oif: a route whose device has that ifindex wins immediately.
 *   Routes on a loopback device are remembered in "local" as a fallback.
 * - Without @oif: the first route whose device passes ipv6_chk_addr()
 *   for @saddr wins.
 * - If nothing matched: "local" if set; the null entry when @oif was
 *   given together with RT6_LOOKUP_F_IFACE; otherwise @rt itself, or
 *   the null entry when @rt is dead.
 */
487 static inline struct rt6_info *rt6_device_match(struct net *net,
488                                                     struct rt6_info *rt,
489                                                     const struct in6_addr *saddr,
490                                                     int oif,
491                                                     int flags)
492 {
493         struct rt6_info *local = NULL;
494         struct rt6_info *sprt;
495
496         if (!oif && ipv6_addr_any(saddr) && !(rt->rt6i_nh_flags & RTNH_F_DEAD))
497                 return rt;
498
499         for (sprt = rt; sprt; sprt = rcu_dereference(sprt->rt6_next)) {
500                 struct net_device *dev = sprt->dst.dev;
501
502                 if (sprt->rt6i_nh_flags & RTNH_F_DEAD)
503                         continue;
504
505                 if (oif) {
506                         if (dev->ifindex == oif)
507                                 return sprt;
508                         if (dev->flags & IFF_LOOPBACK) {
                                    /*
                                     * Loopback route whose inet6_dev is not
                                     * on @oif: rejected under F_IFACE, and it
                                     * must not displace a fallback that is.
                                     */
509                                 if (!sprt->rt6i_idev ||
510                                     sprt->rt6i_idev->dev->ifindex != oif) {
511                                         if (flags & RT6_LOOKUP_F_IFACE)
512                                                 continue;
513                                         if (local &&
514                                             local->rt6i_idev->dev->ifindex == oif)
515                                                 continue;
516                                 }
517                                 local = sprt;
518                         }
519                 } else {
520                         if (ipv6_chk_addr(net, saddr, dev,
521                                           flags & RT6_LOOKUP_F_IFACE))
522                                 return sprt;
523                 }
524         }
525
526         if (oif) {
527                 if (local)
528                         return local;
529
530                 if (flags & RT6_LOOKUP_F_IFACE)
531                         return net->ipv6.ip6_null_entry;
532         }
533
534         return rt->rt6i_nh_flags & RTNH_F_DEAD ? net->ipv6.ip6_null_entry : rt;
535 }
536
537 #ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Deferred router probe request: the address to solicit and the device
 * to send on.  rt6_probe() takes a reference on @dev when queuing the
 * work; rt6_probe_deferred() drops it and frees the structure.
 */
538 struct __rt6_probe_work {
539         struct work_struct work;
540         struct in6_addr target;
541         struct net_device *dev;
542 };
543
/*
 * Work handler queued by rt6_probe(): send a Neighbour Solicitation for
 * work->target to the solicited-node multicast address derived from it,
 * then drop the device reference taken by rt6_probe() and free the
 * work item.
 */
544 static void rt6_probe_deferred(struct work_struct *w)
545 {
546         struct in6_addr mcaddr;
547         struct __rt6_probe_work *work =
548                 container_of(w, struct __rt6_probe_work, work);
549
550         addrconf_addr_solict_mult(&work->target, &mcaddr);
551         ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
552         dev_put(work->dev);
553         kfree(work);
554 }
555
/*
 * Schedule a reachability probe of @rt's gateway when one is due.
 *
 * Only gateway routes are probed.  If a neighbour entry exists and is
 * NUD_VALID nothing is done; if it is not valid, a probe is queued only
 * when more than rtr_probe_interval has passed since neigh->updated
 * (rechecked under the neighbour's write lock).  If no neighbour entry
 * exists at all, a probe is queued unconditionally.
 *
 * The probe itself is sent from a workqueue (rt6_probe_deferred()); a
 * failed GFP_ATOMIC allocation silently skips the probe.
 */
556 static void rt6_probe(struct rt6_info *rt)
557 {
558         struct __rt6_probe_work *work;
559         struct neighbour *neigh;
560         /*
561          * Okay, this does not seem to be appropriate
562          * for now, however, we need to check if it
563          * is really so; aka Router Reachability Probing.
564          *
565          * Router Reachability Probe MUST be rate-limited
566          * to no more than one per minute.
567          */
568         if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
569                 return;
570         rcu_read_lock_bh();
571         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
572         if (neigh) {
573                 if (neigh->nud_state & NUD_VALID)
574                         goto out;
575
576                 work = NULL;
577                 write_lock(&neigh->lock);
578                 if (!(neigh->nud_state & NUD_VALID) &&
579                     time_after(jiffies,
580                                neigh->updated +
581                                rt->rt6i_idev->cnf.rtr_probe_interval)) {
582                         work = kmalloc(sizeof(*work), GFP_ATOMIC);
583                         if (work)
584                                 __neigh_set_probe_once(neigh);
585                 }
586                 write_unlock(&neigh->lock);
587         } else {
588                 work = kmalloc(sizeof(*work), GFP_ATOMIC);
589         }
590
591         if (work) {
592                 INIT_WORK(&work->work, rt6_probe_deferred);
593                 work->target = rt->rt6i_gateway;
                    /* reference is dropped by rt6_probe_deferred() */
594                 dev_hold(rt->dst.dev);
595                 work->dev = rt->dst.dev;
596                 schedule_work(&work->work);
597         }
598
599 out:
600         rcu_read_unlock_bh();
601 }
602 #else
/* Without CONFIG_IPV6_ROUTER_PREF router probing is compiled out. */
603 static inline void rt6_probe(struct rt6_info *rt)
604 {
605 }
606 #endif
607
608 /*
609  * Default Router Selection (RFC 2461 6.3.6)
610  */
/*
 * Rate how well @rt's device fits the requested output interface:
 *   2 - no interface requested, or the route's device is @oif;
 *   1 - the route is on a loopback device but its inet6_dev belongs
 *       to @oif;
 *   0 - no fit.
 */
611 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
612 {
613         const struct net_device *out_dev = rt->dst.dev;
614
615         if (!oif || out_dev->ifindex == oif)
616                 return 2;
617         if (!(out_dev->flags & IFF_LOOPBACK) || !rt->rt6i_idev)
618                 return 0;
619         return rt->rt6i_idev->dev->ifindex == oif ? 1 : 0;
620 }
621
/*
 * Classify the reachability of @rt's gateway.
 *
 * Routes without a next hop (RTF_NONEXTHOP) or without a gateway always
 * succeed.  Otherwise the gateway's neighbour entry is looked up:
 *  - entry found and NUD_VALID: RT6_NUD_SUCCEED;
 *  - entry found, not valid: with CONFIG_IPV6_ROUTER_PREF this is
 *    SUCCEED unless the state is NUD_FAILED (then FAIL_PROBE); without
 *    it the initial RT6_NUD_FAIL_HARD stands;
 *  - no entry: SUCCEED with CONFIG_IPV6_ROUTER_PREF, else FAIL_DO_RR.
 */
622 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
623 {
624         struct neighbour *neigh;
625         enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
626
627         if (rt->rt6i_flags & RTF_NONEXTHOP ||
628             !(rt->rt6i_flags & RTF_GATEWAY))
629                 return RT6_NUD_SUCCEED;
630
631         rcu_read_lock_bh();
632         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
633         if (neigh) {
634                 read_lock(&neigh->lock);
635                 if (neigh->nud_state & NUD_VALID)
636                         ret = RT6_NUD_SUCCEED;
637 #ifdef CONFIG_IPV6_ROUTER_PREF
638                 else if (!(neigh->nud_state & NUD_FAILED))
639                         ret = RT6_NUD_SUCCEED;
640                 else
641                         ret = RT6_NUD_FAIL_PROBE;
642 #endif
643                 read_unlock(&neigh->lock);
644         } else {
645                 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
646                       RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
647         }
648         rcu_read_unlock_bh();
649
650         return ret;
651 }
652
/*
 * Compute a score for @rt; higher is better, negative means failure.
 *
 * The low two bits come from rt6_check_dev(); with
 * CONFIG_IPV6_ROUTER_PREF the decoded router preference is OR-ed in
 * above them (shifted left by 2).  Returns RT6_NUD_FAIL_HARD when the
 * device does not fit and RT6_LOOKUP_F_IFACE is set.  When
 * RT6_LOOKUP_F_REACHABLE is set, a negative rt6_check_neigh() result is
 * returned in place of the score.
 */
653 static int rt6_score_route(struct rt6_info *rt, int oif,
654                            int strict)
655 {
656         int m;
657
658         m = rt6_check_dev(rt, oif);
659         if (!m && (strict & RT6_LOOKUP_F_IFACE))
660                 return RT6_NUD_FAIL_HARD;
661 #ifdef CONFIG_IPV6_ROUTER_PREF
662         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
663 #endif
664         if (strict & RT6_LOOKUP_F_REACHABLE) {
665                 int n = rt6_check_neigh(rt);
666                 if (n < 0)
667                         return n;
668         }
669         return m;
670 }
671
/*
 * Consider @rt as a candidate and return the better of @rt and @match.
 *
 * @rt is rejected outright (the previous @match is returned) when its
 * next hop is dead, when it is link-down and the interface is
 * configured to ignore such routes (unless
 * RT6_LOOKUP_F_IGNORE_LINKSTATE is set), when it has expired, or when
 * scoring returns RT6_NUD_FAIL_HARD.  A RT6_NUD_FAIL_DO_RR score is
 * mapped to 0 and remembered as a request for round-robin.
 *
 * If the resulting score beats *@mpri, then *@mpri and *@do_rr are
 * updated and @rt becomes the match.  With RT6_LOOKUP_F_REACHABLE the
 * route is also handed to rt6_probe().
 */
672 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
673                                    int *mpri, struct rt6_info *match,
674                                    bool *do_rr)
675 {
676         int m;
677         bool match_do_rr = false;
678         struct inet6_dev *idev = rt->rt6i_idev;
679
680         if (rt->rt6i_nh_flags & RTNH_F_DEAD)
681                 goto out;
682
683         if (idev->cnf.ignore_routes_with_linkdown &&
684             rt->rt6i_nh_flags & RTNH_F_LINKDOWN &&
685             !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
686                 goto out;
687
688         if (rt6_check_expired(rt))
689                 goto out;
690
691         m = rt6_score_route(rt, oif, strict);
692         if (m == RT6_NUD_FAIL_DO_RR) {
693                 match_do_rr = true;
694                 m = 0; /* lowest valid score */
695         } else if (m == RT6_NUD_FAIL_HARD) {
696                 goto out;
697         }
698
699         if (strict & RT6_LOOKUP_F_REACHABLE)
700                 rt6_probe(rt);
701
702         /* note that m can be RT6_NUD_FAIL_PROBE at this point */
703         if (m > *mpri) {
704                 *do_rr = match_do_rr;
705                 *mpri = m;
706                 match = rt;
707         }
708 out:
709         return match;
710 }
711
/*
 * Find the best route among those sharing @metric, starting the scan at
 * the round-robin head.
 *
 * Pass 1 walks from @rr_head to the end of the chain, pass 2 from @leaf
 * up to (not including) @rr_head; both stop at the first route whose
 * metric differs and remember it in "cont".  If neither pass produced a
 * match and such a route was seen, pass 3 scans from "cont" to the end
 * of the chain regardless of metric.
 *
 * Returns the best match or NULL.  *@do_rr is set by find_match().
 * @fn is not used.
 */
712 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
713                                      struct rt6_info *leaf,
714                                      struct rt6_info *rr_head,
715                                      u32 metric, int oif, int strict,
716                                      bool *do_rr)
717 {
718         struct rt6_info *rt, *match, *cont;
719         int mpri = -1;
720
721         match = NULL;
722         cont = NULL;
723         for (rt = rr_head; rt; rt = rcu_dereference(rt->rt6_next)) {
724                 if (rt->rt6i_metric != metric) {
725                         cont = rt;
726                         break;
727                 }
728
729                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
730         }
731
732         for (rt = leaf; rt && rt != rr_head;
733              rt = rcu_dereference(rt->rt6_next)) {
734                 if (rt->rt6i_metric != metric) {
735                         cont = rt;
736                         break;
737                 }
738
739                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
740         }
741
742         if (match || !cont)
743                 return match;
744
745         for (rt = cont; rt; rt = rcu_dereference(rt->rt6_next))
746                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
747
748         return match;
749 }
750
/*
 * Select the route to use from FIB node @fn.
 *
 * The scan starts at the node's round-robin pointer (fn->rr_ptr), or at
 * its leaf when that is unset.  When find_rr_leaf() asks for
 * round-robin, rr_ptr is advanced to the next route of the same metric
 * (wrapping to the leaf) under the table lock.
 *
 * Never returns NULL: the namespace's null entry is returned when the
 * node has no usable leaf, when the node's bit length does not match
 * the route's prefix length, or when nothing matched.
 */
751 static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
752                                    int oif, int strict)
753 {
754         struct rt6_info *leaf = rcu_dereference(fn->leaf);
755         struct rt6_info *match, *rt0;
756         bool do_rr = false;
757         int key_plen;
758
759         if (!leaf || leaf == net->ipv6.ip6_null_entry)
760                 return net->ipv6.ip6_null_entry;
761
762         rt0 = rcu_dereference(fn->rr_ptr);
763         if (!rt0)
764                 rt0 = leaf;
765
766         /* Double check to make sure fn is not an intermediate node
767          * and fn->leaf does not point to its child's leaf
768          * (This might happen if all routes under fn are deleted from
769          * the tree and fib6_repair_tree() is called on the node.)
770          */
771         key_plen = rt0->rt6i_dst.plen;
772 #ifdef CONFIG_IPV6_SUBTREES
773         if (rt0->rt6i_src.plen)
774                 key_plen = rt0->rt6i_src.plen;
775 #endif
776         if (fn->fn_bit != key_plen)
777                 return net->ipv6.ip6_null_entry;
778
779         match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict,
780                              &do_rr);
781
782         if (do_rr) {
783                 struct rt6_info *next = rcu_dereference(rt0->rt6_next);
784
785                 /* no entries matched; do round-robin */
786                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
787                         next = leaf;
788
789                 if (next != rt0) {
790                         spin_lock_bh(&leaf->rt6i_table->tb6_lock);
791                         /* make sure next is not being deleted from the tree */
792                         if (next->rt6i_node)
793                                 rcu_assign_pointer(fn->rr_ptr, next);
794                         spin_unlock_bh(&leaf->rt6i_table->tb6_lock);
795                 }
796         }
797
798         return match ? match : net->ipv6.ip6_null_entry;
799 }
800
/* True when @rt is a gateway route or carries RTF_NONEXTHOP. */
801 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
802 {
803         return (rt->rt6i_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) != 0;
804 }
805
806 #ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process a received Route Information option.
 *
 * @dev:    device the option arrived on
 * @opt:    start of the option; interpreted as struct route_info
 * @len:    number of bytes available at @opt
 * @gwaddr: address of the advertising router
 *
 * Returns -EINVAL when the option is shorter than struct route_info,
 * when its length and prefix_len fields are inconsistent, or when the
 * preference is ICMPV6_ROUTER_PREF_INVALID; 0 otherwise.
 *
 * A lifetime of zero deletes an existing route; a non-zero lifetime
 * adds the route if missing or refreshes the preference bits of an
 * existing one.  The expiry is then set, or cleared for an infinite
 * lifetime.
 */
807 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
808                   const struct in6_addr *gwaddr)
809 {
810         struct net *net = dev_net(dev);
811         struct route_info *rinfo = (struct route_info *) opt;
812         struct in6_addr prefix_buf, *prefix;
813         unsigned int pref;
814         unsigned long lifetime;
815         struct rt6_info *rt;
816
817         if (len < sizeof(struct route_info)) {
818                 return -EINVAL;
819         }
820
            /*
             * NOTE(review): per RFC 4191 the option length is in units of
             * 8 octets, so longer prefixes need a larger length - confirm.
             */
821         /* Sanity check for prefix_len and length */
822         if (rinfo->length > 3) {
823                 return -EINVAL;
824         } else if (rinfo->prefix_len > 128) {
825                 return -EINVAL;
826         } else if (rinfo->prefix_len > 64) {
827                 if (rinfo->length < 2) {
828                         return -EINVAL;
829                 }
830         } else if (rinfo->prefix_len > 0) {
831                 if (rinfo->length < 1) {
832                         return -EINVAL;
833                 }
834         }
835
836         pref = rinfo->route_pref;
837         if (pref == ICMPV6_ROUTER_PREF_INVALID)
838                 return -EINVAL;
839
840         lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
841
            /*
             * With length 3 the prefix is used in place; otherwise a copy
             * truncated to prefix_len bits is built in prefix_buf.
             */
842         if (rinfo->length == 3)
843                 prefix = (struct in6_addr *)rinfo->prefix;
844         else {
845                 /* this function is safe */
846                 ipv6_addr_prefix(&prefix_buf,
847                                  (struct in6_addr *)rinfo->prefix,
848                                  rinfo->prefix_len);
849                 prefix = &prefix_buf;
850         }
851
            /* a zero-length prefix refers to the default route via gwaddr */
852         if (rinfo->prefix_len == 0)
853                 rt = rt6_get_dflt_router(gwaddr, dev);
854         else
855                 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
856                                         gwaddr, dev);
857
858         if (rt && !lifetime) {
859                 ip6_del_rt(rt);
860                 rt = NULL;
861         }
862
863         if (!rt && lifetime)
864                 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
865                                         dev, pref);
866         else if (rt)
867                 rt->rt6i_flags = RTF_ROUTEINFO |
868                                  (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
869
870         if (rt) {
871                 if (!addrconf_finite_timeout(lifetime))
872                         rt6_clean_expires(rt);
873                 else
874                         rt6_set_expires(rt, jiffies + HZ * lifetime);
875
876                 ip6_rt_put(rt);
877         }
878         return 0;
879 }
880 #endif
881
882 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
883                                         struct in6_addr *saddr)
884 {
885         struct fib6_node *pn, *sn;
886         while (1) {
887                 if (fn->fn_flags & RTN_TL_ROOT)
888                         return NULL;
889                 pn = rcu_dereference(fn->parent);
890                 sn = FIB6_SUBTREE(pn);
891                 if (sn && sn != fn)
892                         fn = fib6_lookup(sn, NULL, saddr);
893                 else
894                         fn = pn;
895                 if (fn->fn_flags & RTN_RTINFO)
896                         return fn;
897         }
898 }
899
900 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
901                           bool null_fallback)
902 {
903         struct rt6_info *rt = *prt;
904
905         if (dst_hold_safe(&rt->dst))
906                 return true;
907         if (null_fallback) {
908                 rt = net->ipv6.ip6_null_entry;
909                 dst_hold(&rt->dst);
910         } else {
911                 rt = NULL;
912         }
913         *prt = rt;
914         return false;
915 }
916
917 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
918                                              struct fib6_table *table,
919                                              struct flowi6 *fl6,
920                                              const struct sk_buff *skb,
921                                              int flags)
922 {
923         struct rt6_info *rt, *rt_cache;
924         struct fib6_node *fn;
925
926         if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
927                 flags &= ~RT6_LOOKUP_F_IFACE;
928
929         rcu_read_lock();
930         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
931 restart:
932         rt = rcu_dereference(fn->leaf);
933         if (!rt) {
934                 rt = net->ipv6.ip6_null_entry;
935         } else {
936                 rt = rt6_device_match(net, rt, &fl6->saddr,
937                                       fl6->flowi6_oif, flags);
938                 if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
939                         rt = rt6_multipath_select(net, rt, fl6, fl6->flowi6_oif,
940                                                   skb, flags);
941         }
942         if (rt == net->ipv6.ip6_null_entry) {
943                 fn = fib6_backtrack(fn, &fl6->saddr);
944                 if (fn)
945                         goto restart;
946         }
947         /* Search through exception table */
948         rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
949         if (rt_cache)
950                 rt = rt_cache;
951
952         if (ip6_hold_safe(net, &rt, true))
953                 dst_use_noref(&rt->dst, jiffies);
954
955         rcu_read_unlock();
956
957         trace_fib6_table_lookup(net, rt, table, fl6);
958
959         return rt;
960
961 }
962
963 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
964                                    const struct sk_buff *skb, int flags)
965 {
966         return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
967 }
968 EXPORT_SYMBOL_GPL(ip6_route_lookup);
969
970 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
971                             const struct in6_addr *saddr, int oif,
972                             const struct sk_buff *skb, int strict)
973 {
974         struct flowi6 fl6 = {
975                 .flowi6_oif = oif,
976                 .daddr = *daddr,
977         };
978         struct dst_entry *dst;
979         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
980
981         if (saddr) {
982                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
983                 flags |= RT6_LOOKUP_F_HAS_SADDR;
984         }
985
986         dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
987         if (dst->error == 0)
988                 return (struct rt6_info *) dst;
989
990         dst_release(dst);
991
992         return NULL;
993 }
994 EXPORT_SYMBOL(rt6_lookup);
995
996 /* ip6_ins_rt is called with FREE table->tb6_lock.
997  * It takes new route entry, the addition fails by any reason the
998  * route is released.
999  * Caller must hold dst before calling it.
1000  */
1001
1002 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
1003                         struct mx6_config *mxc,
1004                         struct netlink_ext_ack *extack)
1005 {
1006         int err;
1007         struct fib6_table *table;
1008
1009         table = rt->rt6i_table;
1010         spin_lock_bh(&table->tb6_lock);
1011         err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
1012         spin_unlock_bh(&table->tb6_lock);
1013
1014         return err;
1015 }
1016
1017 int ip6_ins_rt(struct rt6_info *rt)
1018 {
1019         struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
1020         struct mx6_config mxc = { .mx = NULL, };
1021
1022         /* Hold dst to account for the reference from the fib6 tree */
1023         dst_hold(&rt->dst);
1024         return __ip6_ins_rt(rt, &info, &mxc, NULL);
1025 }
1026
1027 /* called with rcu_lock held */
1028 static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
1029 {
1030         struct net_device *dev = rt->dst.dev;
1031
1032         if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) {
1033                 /* for copies of local routes, dst->dev needs to be the
1034                  * device if it is a master device, the master device if
1035                  * device is enslaved, and the loopback as the default
1036                  */
1037                 if (netif_is_l3_slave(dev) &&
1038                     !rt6_need_strict(&rt->rt6i_dst.addr))
1039                         dev = l3mdev_master_dev_rcu(dev);
1040                 else if (!netif_is_l3_master(dev))
1041                         dev = dev_net(dev)->loopback_dev;
1042                 /* last case is netif_is_l3_master(dev) is true in which
1043                  * case we want dev returned to be dev
1044                  */
1045         }
1046
1047         return dev;
1048 }
1049
1050 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
1051                                            const struct in6_addr *daddr,
1052                                            const struct in6_addr *saddr)
1053 {
1054         struct net_device *dev;
1055         struct rt6_info *rt;
1056
1057         /*
1058          *      Clone the route.
1059          */
1060
1061         if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
1062                 ort = ort->from;
1063
1064         rcu_read_lock();
1065         dev = ip6_rt_get_dev_rcu(ort);
1066         rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
1067         rcu_read_unlock();
1068         if (!rt)
1069                 return NULL;
1070
1071         ip6_rt_copy_init(rt, ort);
1072         rt->rt6i_flags |= RTF_CACHE;
1073         rt->rt6i_metric = 0;
1074         rt->dst.flags |= DST_HOST;
1075         rt->rt6i_dst.addr = *daddr;
1076         rt->rt6i_dst.plen = 128;
1077
1078         if (!rt6_is_gw_or_nonexthop(ort)) {
1079                 if (ort->rt6i_dst.plen != 128 &&
1080                     ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
1081                         rt->rt6i_flags |= RTF_ANYCAST;
1082 #ifdef CONFIG_IPV6_SUBTREES
1083                 if (rt->rt6i_src.plen && saddr) {
1084                         rt->rt6i_src.addr = *saddr;
1085                         rt->rt6i_src.plen = 128;
1086                 }
1087 #endif
1088         }
1089
1090         return rt;
1091 }
1092
1093 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
1094 {
1095         struct net_device *dev;
1096         struct rt6_info *pcpu_rt;
1097
1098         rcu_read_lock();
1099         dev = ip6_rt_get_dev_rcu(rt);
1100         pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
1101         rcu_read_unlock();
1102         if (!pcpu_rt)
1103                 return NULL;
1104         ip6_rt_copy_init(pcpu_rt, rt);
1105         pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
1106         pcpu_rt->rt6i_flags |= RTF_PCPU;
1107         return pcpu_rt;
1108 }
1109
1110 /* It should be called with rcu_read_lock() acquired */
1111 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
1112 {
1113         struct rt6_info *pcpu_rt, **p;
1114
1115         p = this_cpu_ptr(rt->rt6i_pcpu);
1116         pcpu_rt = *p;
1117
1118         if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false))
1119                 rt6_dst_from_metrics_check(pcpu_rt);
1120
1121         return pcpu_rt;
1122 }
1123
1124 static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
1125 {
1126         struct rt6_info *pcpu_rt, *prev, **p;
1127
1128         pcpu_rt = ip6_rt_pcpu_alloc(rt);
1129         if (!pcpu_rt) {
1130                 struct net *net = dev_net(rt->dst.dev);
1131
1132                 dst_hold(&net->ipv6.ip6_null_entry->dst);
1133                 return net->ipv6.ip6_null_entry;
1134         }
1135
1136         dst_hold(&pcpu_rt->dst);
1137         p = this_cpu_ptr(rt->rt6i_pcpu);
1138         prev = cmpxchg(p, NULL, pcpu_rt);
1139         BUG_ON(prev);
1140
1141         rt6_dst_from_metrics_check(pcpu_rt);
1142         return pcpu_rt;
1143 }
1144
1145 /* exception hash table implementation
1146  */
1147 static DEFINE_SPINLOCK(rt6_exception_lock);
1148
1149 /* Remove rt6_ex from hash table and free the memory
1150  * Caller must hold rt6_exception_lock
1151  */
1152 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1153                                  struct rt6_exception *rt6_ex)
1154 {
1155         struct net *net;
1156
1157         if (!bucket || !rt6_ex)
1158                 return;
1159
1160         net = dev_net(rt6_ex->rt6i->dst.dev);
1161         rt6_ex->rt6i->rt6i_node = NULL;
1162         hlist_del_rcu(&rt6_ex->hlist);
1163         rt6_release(rt6_ex->rt6i);
1164         kfree_rcu(rt6_ex, rcu);
1165         WARN_ON_ONCE(!bucket->depth);
1166         bucket->depth--;
1167         net->ipv6.rt6_stats->fib_rt_cache--;
1168 }
1169
1170 /* Remove oldest rt6_ex in bucket and free the memory
1171  * Caller must hold rt6_exception_lock
1172  */
1173 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1174 {
1175         struct rt6_exception *rt6_ex, *oldest = NULL;
1176
1177         if (!bucket)
1178                 return;
1179
1180         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1181                 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1182                         oldest = rt6_ex;
1183         }
1184         rt6_remove_exception(bucket, oldest);
1185 }
1186
1187 static u32 rt6_exception_hash(const struct in6_addr *dst,
1188                               const struct in6_addr *src)
1189 {
1190         static u32 seed __read_mostly;
1191         u32 val;
1192
1193         net_get_random_once(&seed, sizeof(seed));
1194         val = jhash(dst, sizeof(*dst), seed);
1195
1196 #ifdef CONFIG_IPV6_SUBTREES
1197         if (src)
1198                 val = jhash(src, sizeof(*src), val);
1199 #endif
1200         return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1201 }
1202
1203 /* Helper function to find the cached rt in the hash table
1204  * and update bucket pointer to point to the bucket for this
1205  * (daddr, saddr) pair
1206  * Caller must hold rt6_exception_lock
1207  */
1208 static struct rt6_exception *
1209 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1210                               const struct in6_addr *daddr,
1211                               const struct in6_addr *saddr)
1212 {
1213         struct rt6_exception *rt6_ex;
1214         u32 hval;
1215
1216         if (!(*bucket) || !daddr)
1217                 return NULL;
1218
1219         hval = rt6_exception_hash(daddr, saddr);
1220         *bucket += hval;
1221
1222         hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1223                 struct rt6_info *rt6 = rt6_ex->rt6i;
1224                 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1225
1226 #ifdef CONFIG_IPV6_SUBTREES
1227                 if (matched && saddr)
1228                         matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1229 #endif
1230                 if (matched)
1231                         return rt6_ex;
1232         }
1233         return NULL;
1234 }
1235
1236 /* Helper function to find the cached rt in the hash table
1237  * and update bucket pointer to point to the bucket for this
1238  * (daddr, saddr) pair
1239  * Caller must hold rcu_read_lock()
1240  */
1241 static struct rt6_exception *
1242 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1243                          const struct in6_addr *daddr,
1244                          const struct in6_addr *saddr)
1245 {
1246         struct rt6_exception *rt6_ex;
1247         u32 hval;
1248
1249         WARN_ON_ONCE(!rcu_read_lock_held());
1250
1251         if (!(*bucket) || !daddr)
1252                 return NULL;
1253
1254         hval = rt6_exception_hash(daddr, saddr);
1255         *bucket += hval;
1256
1257         hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1258                 struct rt6_info *rt6 = rt6_ex->rt6i;
1259                 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1260
1261 #ifdef CONFIG_IPV6_SUBTREES
1262                 if (matched && saddr)
1263                         matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1264 #endif
1265                 if (matched)
1266                         return rt6_ex;
1267         }
1268         return NULL;
1269 }
1270
1271 static int rt6_insert_exception(struct rt6_info *nrt,
1272                                 struct rt6_info *ort)
1273 {
1274         struct net *net = dev_net(ort->dst.dev);
1275         struct rt6_exception_bucket *bucket;
1276         struct in6_addr *src_key = NULL;
1277         struct rt6_exception *rt6_ex;
1278         int err = 0;
1279
1280         /* ort can't be a cache or pcpu route */
1281         if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
1282                 ort = ort->from;
1283         WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU));
1284
1285         spin_lock_bh(&rt6_exception_lock);
1286
1287         if (ort->exception_bucket_flushed) {
1288                 err = -EINVAL;
1289                 goto out;
1290         }
1291
1292         bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
1293                                         lockdep_is_held(&rt6_exception_lock));
1294         if (!bucket) {
1295                 bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1296                                  GFP_ATOMIC);
1297                 if (!bucket) {
1298                         err = -ENOMEM;
1299                         goto out;
1300                 }
1301                 rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
1302         }
1303
1304 #ifdef CONFIG_IPV6_SUBTREES
1305         /* rt6i_src.plen != 0 indicates ort is in subtree
1306          * and exception table is indexed by a hash of
1307          * both rt6i_dst and rt6i_src.
1308          * Otherwise, the exception table is indexed by
1309          * a hash of only rt6i_dst.
1310          */
1311         if (ort->rt6i_src.plen)
1312                 src_key = &nrt->rt6i_src.addr;
1313 #endif
1314
1315         /* Update rt6i_prefsrc as it could be changed
1316          * in rt6_remove_prefsrc()
1317          */
1318         nrt->rt6i_prefsrc = ort->rt6i_prefsrc;
1319         /* rt6_mtu_change() might lower mtu on ort.
1320          * Only insert this exception route if its mtu
1321          * is less than ort's mtu value.
1322          */
1323         if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) {
1324                 err = -EINVAL;
1325                 goto out;
1326         }
1327
1328         rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1329                                                src_key);
1330         if (rt6_ex)
1331                 rt6_remove_exception(bucket, rt6_ex);
1332
1333         rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1334         if (!rt6_ex) {
1335                 err = -ENOMEM;
1336                 goto out;
1337         }
1338         rt6_ex->rt6i = nrt;
1339         rt6_ex->stamp = jiffies;
1340         atomic_inc(&nrt->rt6i_ref);
1341         nrt->rt6i_node = ort->rt6i_node;
1342         hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1343         bucket->depth++;
1344         net->ipv6.rt6_stats->fib_rt_cache++;
1345
1346         if (bucket->depth > FIB6_MAX_DEPTH)
1347                 rt6_exception_remove_oldest(bucket);
1348
1349 out:
1350         spin_unlock_bh(&rt6_exception_lock);
1351
1352         /* Update fn->fn_sernum to invalidate all cached dst */
1353         if (!err) {
1354                 spin_lock_bh(&ort->rt6i_table->tb6_lock);
1355                 fib6_update_sernum(net, ort);
1356                 spin_unlock_bh(&ort->rt6i_table->tb6_lock);
1357                 fib6_force_start_gc(net);
1358         }
1359
1360         return err;
1361 }
1362
1363 void rt6_flush_exceptions(struct rt6_info *rt)
1364 {
1365         struct rt6_exception_bucket *bucket;
1366         struct rt6_exception *rt6_ex;
1367         struct hlist_node *tmp;
1368         int i;
1369
1370         spin_lock_bh(&rt6_exception_lock);
1371         /* Prevent rt6_insert_exception() to recreate the bucket list */
1372         rt->exception_bucket_flushed = 1;
1373
1374         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1375                                     lockdep_is_held(&rt6_exception_lock));
1376         if (!bucket)
1377                 goto out;
1378
1379         for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1380                 hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1381                         rt6_remove_exception(bucket, rt6_ex);
1382                 WARN_ON_ONCE(bucket->depth);
1383                 bucket++;
1384         }
1385
1386 out:
1387         spin_unlock_bh(&rt6_exception_lock);
1388 }
1389
1390 /* Find cached rt in the hash table inside passed in rt
1391  * Caller has to hold rcu_read_lock()
1392  */
1393 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
1394                                            struct in6_addr *daddr,
1395                                            struct in6_addr *saddr)
1396 {
1397         struct rt6_exception_bucket *bucket;
1398         struct in6_addr *src_key = NULL;
1399         struct rt6_exception *rt6_ex;
1400         struct rt6_info *res = NULL;
1401
1402         bucket = rcu_dereference(rt->rt6i_exception_bucket);
1403
1404 #ifdef CONFIG_IPV6_SUBTREES
1405         /* rt6i_src.plen != 0 indicates rt is in subtree
1406          * and exception table is indexed by a hash of
1407          * both rt6i_dst and rt6i_src.
1408          * Otherwise, the exception table is indexed by
1409          * a hash of only rt6i_dst.
1410          */
1411         if (rt->rt6i_src.plen)
1412                 src_key = saddr;
1413 #endif
1414         rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1415
1416         if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1417                 res = rt6_ex->rt6i;
1418
1419         return res;
1420 }
1421
1422 /* Remove the passed in cached rt from the hash table that contains it */
1423 int rt6_remove_exception_rt(struct rt6_info *rt)
1424 {
1425         struct rt6_exception_bucket *bucket;
1426         struct rt6_info *from = rt->from;
1427         struct in6_addr *src_key = NULL;
1428         struct rt6_exception *rt6_ex;
1429         int err;
1430
1431         if (!from ||
1432             !(rt->rt6i_flags & RTF_CACHE))
1433                 return -EINVAL;
1434
1435         if (!rcu_access_pointer(from->rt6i_exception_bucket))
1436                 return -ENOENT;
1437
1438         spin_lock_bh(&rt6_exception_lock);
1439         bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1440                                     lockdep_is_held(&rt6_exception_lock));
1441 #ifdef CONFIG_IPV6_SUBTREES
1442         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1443          * and exception table is indexed by a hash of
1444          * both rt6i_dst and rt6i_src.
1445          * Otherwise, the exception table is indexed by
1446          * a hash of only rt6i_dst.
1447          */
1448         if (from->rt6i_src.plen)
1449                 src_key = &rt->rt6i_src.addr;
1450 #endif
1451         rt6_ex = __rt6_find_exception_spinlock(&bucket,
1452                                                &rt->rt6i_dst.addr,
1453                                                src_key);
1454         if (rt6_ex) {
1455                 rt6_remove_exception(bucket, rt6_ex);
1456                 err = 0;
1457         } else {
1458                 err = -ENOENT;
1459         }
1460
1461         spin_unlock_bh(&rt6_exception_lock);
1462         return err;
1463 }
1464
1465 /* Find rt6_ex which contains the passed in rt cache and
1466  * refresh its stamp
1467  */
1468 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1469 {
1470         struct rt6_exception_bucket *bucket;
1471         struct rt6_info *from = rt->from;
1472         struct in6_addr *src_key = NULL;
1473         struct rt6_exception *rt6_ex;
1474
1475         if (!from ||
1476             !(rt->rt6i_flags & RTF_CACHE))
1477                 return;
1478
1479         rcu_read_lock();
1480         bucket = rcu_dereference(from->rt6i_exception_bucket);
1481
1482 #ifdef CONFIG_IPV6_SUBTREES
1483         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1484          * and exception table is indexed by a hash of
1485          * both rt6i_dst and rt6i_src.
1486          * Otherwise, the exception table is indexed by
1487          * a hash of only rt6i_dst.
1488          */
1489         if (from->rt6i_src.plen)
1490                 src_key = &rt->rt6i_src.addr;
1491 #endif
1492         rt6_ex = __rt6_find_exception_rcu(&bucket,
1493                                           &rt->rt6i_dst.addr,
1494                                           src_key);
1495         if (rt6_ex)
1496                 rt6_ex->stamp = jiffies;
1497
1498         rcu_read_unlock();
1499 }
1500
1501 static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt)
1502 {
1503         struct rt6_exception_bucket *bucket;
1504         struct rt6_exception *rt6_ex;
1505         int i;
1506
1507         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1508                                         lockdep_is_held(&rt6_exception_lock));
1509
1510         if (bucket) {
1511                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1512                         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1513                                 rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
1514                         }
1515                         bucket++;
1516                 }
1517         }
1518 }
1519
1520 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1521                                          struct rt6_info *rt, int mtu)
1522 {
1523         /* If the new MTU is lower than the route PMTU, this new MTU will be the
1524          * lowest MTU in the path: always allow updating the route PMTU to
1525          * reflect PMTU decreases.
1526          *
1527          * If the new MTU is higher, and the route PMTU is equal to the local
1528          * MTU, this means the old MTU is the lowest in the path, so allow
1529          * updating it: if other nodes now have lower MTUs, PMTU discovery will
1530          * handle this.
1531          */
1532
1533         if (dst_mtu(&rt->dst) >= mtu)
1534                 return true;
1535
1536         if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1537                 return true;
1538
1539         return false;
1540 }
1541
1542 static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
1543                                        struct rt6_info *rt, int mtu)
1544 {
1545         struct rt6_exception_bucket *bucket;
1546         struct rt6_exception *rt6_ex;
1547         int i;
1548
1549         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1550                                         lockdep_is_held(&rt6_exception_lock));
1551
1552         if (!bucket)
1553                 return;
1554
1555         for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1556                 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1557                         struct rt6_info *entry = rt6_ex->rt6i;
1558
1559                         /* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
1560                          * route), the metrics of its rt->dst.from have already
1561                          * been updated.
1562                          */
1563                         if (entry->rt6i_pmtu &&
1564                             rt6_mtu_change_route_allowed(idev, entry, mtu))
1565                                 entry->rt6i_pmtu = mtu;
1566                 }
1567                 bucket++;
1568         }
1569 }
1570
1571 #define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
1572
1573 static void rt6_exceptions_clean_tohost(struct rt6_info *rt,
1574                                         struct in6_addr *gateway)
1575 {
1576         struct rt6_exception_bucket *bucket;
1577         struct rt6_exception *rt6_ex;
1578         struct hlist_node *tmp;
1579         int i;
1580
1581         if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1582                 return;
1583
1584         spin_lock_bh(&rt6_exception_lock);
1585         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1586                                      lockdep_is_held(&rt6_exception_lock));
1587
1588         if (bucket) {
1589                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1590                         hlist_for_each_entry_safe(rt6_ex, tmp,
1591                                                   &bucket->chain, hlist) {
1592                                 struct rt6_info *entry = rt6_ex->rt6i;
1593
1594                                 if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1595                                     RTF_CACHE_GATEWAY &&
1596                                     ipv6_addr_equal(gateway,
1597                                                     &entry->rt6i_gateway)) {
1598                                         rt6_remove_exception(bucket, rt6_ex);
1599                                 }
1600                         }
1601                         bucket++;
1602                 }
1603         }
1604
1605         spin_unlock_bh(&rt6_exception_lock);
1606 }
1607
1608 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1609                                       struct rt6_exception *rt6_ex,
1610                                       struct fib6_gc_args *gc_args,
1611                                       unsigned long now)
1612 {
1613         struct rt6_info *rt = rt6_ex->rt6i;
1614
1615         /* we are pruning and obsoleting aged-out and non gateway exceptions
1616          * even if others have still references to them, so that on next
1617          * dst_check() such references can be dropped.
1618          * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
1619          * expired, independently from their aging, as per RFC 8201 section 4
1620          */
1621         if (!(rt->rt6i_flags & RTF_EXPIRES)) {
1622                 if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
1623                         RT6_TRACE("aging clone %p\n", rt);
1624                         rt6_remove_exception(bucket, rt6_ex);
1625                         return;
1626                 }
1627         } else if (time_after(jiffies, rt->dst.expires)) {
1628                 RT6_TRACE("purging expired route %p\n", rt);
1629                 rt6_remove_exception(bucket, rt6_ex);
1630                 return;
1631         }
1632
1633         if (rt->rt6i_flags & RTF_GATEWAY) {
1634                 struct neighbour *neigh;
1635                 __u8 neigh_flags = 0;
1636
1637                 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
1638                 if (neigh)
1639                         neigh_flags = neigh->flags;
1640
1641                 if (!(neigh_flags & NTF_ROUTER)) {
1642                         RT6_TRACE("purging route %p via non-router but gateway\n",
1643                                   rt);
1644                         rt6_remove_exception(bucket, rt6_ex);
1645                         return;
1646                 }
1647         }
1648
1649         gc_args->more++;
1650 }
1651
1652 void rt6_age_exceptions(struct rt6_info *rt,
1653                         struct fib6_gc_args *gc_args,
1654                         unsigned long now)
1655 {
1656         struct rt6_exception_bucket *bucket;
1657         struct rt6_exception *rt6_ex;
1658         struct hlist_node *tmp;
1659         int i;
1660
1661         if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1662                 return;
1663
1664         rcu_read_lock_bh();
1665         spin_lock(&rt6_exception_lock);
1666         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1667                                     lockdep_is_held(&rt6_exception_lock));
1668
1669         if (bucket) {
1670                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1671                         hlist_for_each_entry_safe(rt6_ex, tmp,
1672                                                   &bucket->chain, hlist) {
1673                                 rt6_age_examine_exception(bucket, rt6_ex,
1674                                                           gc_args, now);
1675                         }
1676                         bucket++;
1677                 }
1678         }
1679         spin_unlock(&rt6_exception_lock);
1680         rcu_read_unlock_bh();
1681 }
1682
1683 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1684                                int oif, struct flowi6 *fl6,
1685                                const struct sk_buff *skb, int flags)
1686 {
1687         struct fib6_node *fn, *saved_fn;
1688         struct rt6_info *rt, *rt_cache;
1689         int strict = 0;
1690
1691         strict |= flags & RT6_LOOKUP_F_IFACE;
1692         strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1693         if (net->ipv6.devconf_all->forwarding == 0)
1694                 strict |= RT6_LOOKUP_F_REACHABLE;
1695
1696         rcu_read_lock();
1697
1698         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1699         saved_fn = fn;
1700
1701         if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1702                 oif = 0;
1703
1704 redo_rt6_select:
1705         rt = rt6_select(net, fn, oif, strict);
1706         if (rt->rt6i_nsiblings)
1707                 rt = rt6_multipath_select(net, rt, fl6, oif, skb, strict);
1708         if (rt == net->ipv6.ip6_null_entry) {
1709                 fn = fib6_backtrack(fn, &fl6->saddr);
1710                 if (fn)
1711                         goto redo_rt6_select;
1712                 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1713                         /* also consider unreachable route */
1714                         strict &= ~RT6_LOOKUP_F_REACHABLE;
1715                         fn = saved_fn;
1716                         goto redo_rt6_select;
1717                 }
1718         }
1719
1720         /*Search through exception table */
1721         rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
1722         if (rt_cache)
1723                 rt = rt_cache;
1724
1725         if (rt == net->ipv6.ip6_null_entry) {
1726                 rcu_read_unlock();
1727                 dst_hold(&rt->dst);
1728                 trace_fib6_table_lookup(net, rt, table, fl6);
1729                 return rt;
1730         } else if (rt->rt6i_flags & RTF_CACHE) {
1731                 if (ip6_hold_safe(net, &rt, true)) {
1732                         dst_use_noref(&rt->dst, jiffies);
1733                         rt6_dst_from_metrics_check(rt);
1734                 }
1735                 rcu_read_unlock();
1736                 trace_fib6_table_lookup(net, rt, table, fl6);
1737                 return rt;
1738         } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1739                             !(rt->rt6i_flags & RTF_GATEWAY))) {
1740                 /* Create a RTF_CACHE clone which will not be
1741                  * owned by the fib6 tree.  It is for the special case where
1742                  * the daddr in the skb during the neighbor look-up is different
1743                  * from the fl6->daddr used to look-up route here.
1744                  */
1745
1746                 struct rt6_info *uncached_rt;
1747
1748                 if (ip6_hold_safe(net, &rt, true)) {
1749                         dst_use_noref(&rt->dst, jiffies);
1750                 } else {
1751                         rcu_read_unlock();
1752                         uncached_rt = rt;
1753                         goto uncached_rt_out;
1754                 }
1755                 rcu_read_unlock();
1756
1757                 uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1758                 dst_release(&rt->dst);
1759
1760                 if (uncached_rt) {
1761                         /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1762                          * No need for another dst_hold()
1763                          */
1764                         rt6_uncached_list_add(uncached_rt);
1765                         atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1766                 } else {
1767                         uncached_rt = net->ipv6.ip6_null_entry;
1768                         dst_hold(&uncached_rt->dst);
1769                 }
1770
1771 uncached_rt_out:
1772                 trace_fib6_table_lookup(net, uncached_rt, table, fl6);
1773                 return uncached_rt;
1774
1775         } else {
1776                 /* Get a percpu copy */
1777
1778                 struct rt6_info *pcpu_rt;
1779
1780                 dst_use_noref(&rt->dst, jiffies);
1781                 local_bh_disable();
1782                 pcpu_rt = rt6_get_pcpu_route(rt);
1783
1784                 if (!pcpu_rt) {
1785                         /* atomic_inc_not_zero() is needed when using rcu */
1786                         if (atomic_inc_not_zero(&rt->rt6i_ref)) {
1787                                 /* No dst_hold() on rt is needed because grabbing
1788                                  * rt->rt6i_ref makes sure rt can't be released.
1789                                  */
1790                                 pcpu_rt = rt6_make_pcpu_route(rt);
1791                                 rt6_release(rt);
1792                         } else {
1793                                 /* rt is already removed from tree */
1794                                 pcpu_rt = net->ipv6.ip6_null_entry;
1795                                 dst_hold(&pcpu_rt->dst);
1796                         }
1797                 }
1798                 local_bh_enable();
1799                 rcu_read_unlock();
1800                 trace_fib6_table_lookup(net, pcpu_rt, table, fl6);
1801                 return pcpu_rt;
1802         }
1803 }
1804 EXPORT_SYMBOL_GPL(ip6_pol_route);
1805
1806 static struct rt6_info *ip6_pol_route_input(struct net *net,
1807                                             struct fib6_table *table,
1808                                             struct flowi6 *fl6,
1809                                             const struct sk_buff *skb,
1810                                             int flags)
1811 {
1812         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
1813 }
1814
1815 struct dst_entry *ip6_route_input_lookup(struct net *net,
1816                                          struct net_device *dev,
1817                                          struct flowi6 *fl6,
1818                                          const struct sk_buff *skb,
1819                                          int flags)
1820 {
1821         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1822                 flags |= RT6_LOOKUP_F_IFACE;
1823
1824         return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1825 }
1826 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1827
1828 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1829                                   struct flow_keys *keys,
1830                                   struct flow_keys *flkeys)
1831 {
1832         const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1833         const struct ipv6hdr *key_iph = outer_iph;
1834         struct flow_keys *_flkeys = flkeys;
1835         const struct ipv6hdr *inner_iph;
1836         const struct icmp6hdr *icmph;
1837         struct ipv6hdr _inner_iph;
1838
1839         if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1840                 goto out;
1841
1842         icmph = icmp6_hdr(skb);
1843         if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1844             icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1845             icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1846             icmph->icmp6_type != ICMPV6_PARAMPROB)
1847                 goto out;
1848
1849         inner_iph = skb_header_pointer(skb,
1850                                        skb_transport_offset(skb) + sizeof(*icmph),
1851                                        sizeof(_inner_iph), &_inner_iph);
1852         if (!inner_iph)
1853                 goto out;
1854
1855         key_iph = inner_iph;
1856         _flkeys = NULL;
1857 out:
1858         if (_flkeys) {
1859                 keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
1860                 keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
1861                 keys->tags.flow_label = _flkeys->tags.flow_label;
1862                 keys->basic.ip_proto = _flkeys->basic.ip_proto;
1863         } else {
1864                 keys->addrs.v6addrs.src = key_iph->saddr;
1865                 keys->addrs.v6addrs.dst = key_iph->daddr;
1866                 keys->tags.flow_label = ip6_flowinfo(key_iph);
1867                 keys->basic.ip_proto = key_iph->nexthdr;
1868         }
1869 }
1870
1871 /* if skb is set it will be used and fl6 can be NULL */
1872 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
1873                        const struct sk_buff *skb, struct flow_keys *flkeys)
1874 {
1875         struct flow_keys hash_keys;
1876         u32 mhash;
1877
1878         switch (ip6_multipath_hash_policy(net)) {
1879         case 0:
1880                 memset(&hash_keys, 0, sizeof(hash_keys));
1881                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1882                 if (skb) {
1883                         ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
1884                 } else {
1885                         hash_keys.addrs.v6addrs.src = fl6->saddr;
1886                         hash_keys.addrs.v6addrs.dst = fl6->daddr;
1887                         hash_keys.tags.flow_label = (__force u32)fl6->flowlabel;
1888                         hash_keys.basic.ip_proto = fl6->flowi6_proto;
1889                 }
1890                 break;
1891         case 1:
1892                 if (skb) {
1893                         unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1894                         struct flow_keys keys;
1895
1896                         /* short-circuit if we already have L4 hash present */
1897                         if (skb->l4_hash)
1898                                 return skb_get_hash_raw(skb) >> 1;
1899
1900                         memset(&hash_keys, 0, sizeof(hash_keys));
1901
1902                         if (!flkeys) {
1903                                 skb_flow_dissect_flow_keys(skb, &keys, flag);
1904                                 flkeys = &keys;
1905                         }
1906                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1907                         hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
1908                         hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
1909                         hash_keys.ports.src = flkeys->ports.src;
1910                         hash_keys.ports.dst = flkeys->ports.dst;
1911                         hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1912                 } else {
1913                         memset(&hash_keys, 0, sizeof(hash_keys));
1914                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1915                         hash_keys.addrs.v6addrs.src = fl6->saddr;
1916                         hash_keys.addrs.v6addrs.dst = fl6->daddr;
1917                         hash_keys.ports.src = fl6->fl6_sport;
1918                         hash_keys.ports.dst = fl6->fl6_dport;
1919                         hash_keys.basic.ip_proto = fl6->flowi6_proto;
1920                 }
1921                 break;
1922         }
1923         mhash = flow_hash_from_keys(&hash_keys);
1924
1925         return mhash >> 1;
1926 }
1927
1928 void ip6_route_input(struct sk_buff *skb)
1929 {
1930         const struct ipv6hdr *iph = ipv6_hdr(skb);
1931         struct net *net = dev_net(skb->dev);
1932         int flags = RT6_LOOKUP_F_HAS_SADDR;
1933         struct ip_tunnel_info *tun_info;
1934         struct flowi6 fl6 = {
1935                 .flowi6_iif = skb->dev->ifindex,
1936                 .daddr = iph->daddr,
1937                 .saddr = iph->saddr,
1938                 .flowlabel = ip6_flowinfo(iph),
1939                 .flowi6_mark = skb->mark,
1940                 .flowi6_proto = iph->nexthdr,
1941         };
1942         struct flow_keys *flkeys = NULL, _flkeys;
1943
1944         tun_info = skb_tunnel_info(skb);
1945         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1946                 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
1947
1948         if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
1949                 flkeys = &_flkeys;
1950
1951         if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
1952                 fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
1953         skb_dst_drop(skb);
1954         skb_dst_set(skb,
1955                     ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
1956 }
1957
1958 static struct rt6_info *ip6_pol_route_output(struct net *net,
1959                                              struct fib6_table *table,
1960                                              struct flowi6 *fl6,
1961                                              const struct sk_buff *skb,
1962                                              int flags)
1963 {
1964         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
1965 }
1966
1967 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
1968                                          struct flowi6 *fl6, int flags)
1969 {
1970         bool any_src;
1971
1972         if (rt6_need_strict(&fl6->daddr)) {
1973                 struct dst_entry *dst;
1974
1975                 dst = l3mdev_link_scope_lookup(net, fl6);
1976                 if (dst)
1977                         return dst;
1978         }
1979
1980         fl6->flowi6_iif = LOOPBACK_IFINDEX;
1981
1982         any_src = ipv6_addr_any(&fl6->saddr);
1983         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1984             (fl6->flowi6_oif && any_src))
1985                 flags |= RT6_LOOKUP_F_IFACE;
1986
1987         if (!any_src)
1988                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1989         else if (sk)
1990                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1991
1992         return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
1993 }
1994 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
1995
1996 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1997 {
1998         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1999         struct net_device *loopback_dev = net->loopback_dev;
2000         struct dst_entry *new = NULL;
2001
2002         rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
2003                        DST_OBSOLETE_DEAD, 0);
2004         if (rt) {
2005                 rt6_info_init(rt);
2006                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
2007
2008                 new = &rt->dst;
2009                 new->__use = 1;
2010                 new->input = dst_discard;
2011                 new->output = dst_discard_out;
2012
2013                 dst_copy_metrics(new, &ort->dst);
2014
2015                 rt->rt6i_idev = in6_dev_get(loopback_dev);
2016                 rt->rt6i_gateway = ort->rt6i_gateway;
2017                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
2018                 rt->rt6i_metric = 0;
2019
2020                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
2021 #ifdef CONFIG_IPV6_SUBTREES
2022                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
2023 #endif
2024         }
2025
2026         dst_release(dst_orig);
2027         return new ? new : ERR_PTR(-ENOMEM);
2028 }
2029
2030 /*
2031  *      Destination cache support functions
2032  */
2033
2034 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
2035 {
2036         if (rt->from &&
2037             dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(&rt->from->dst))
2038                 dst_init_metrics(&rt->dst, dst_metrics_ptr(&rt->from->dst), true);
2039 }
2040
2041 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
2042 {
2043         u32 rt_cookie = 0;
2044
2045         if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
2046                 return NULL;
2047
2048         if (rt6_check_expired(rt))
2049                 return NULL;
2050
2051         return &rt->dst;
2052 }
2053
2054 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
2055 {
2056         if (!__rt6_check_expired(rt) &&
2057             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2058             rt6_check(rt->from, cookie))
2059                 return &rt->dst;
2060         else
2061                 return NULL;
2062 }
2063
2064 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
2065 {
2066         struct rt6_info *rt;
2067
2068         rt = (struct rt6_info *) dst;
2069
2070         /* All IPV6 dsts are created with ->obsolete set to the value
2071          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
2072          * into this function always.
2073          */
2074
2075         rt6_dst_from_metrics_check(rt);
2076
2077         if (rt->rt6i_flags & RTF_PCPU ||
2078             (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from))
2079                 return rt6_dst_from_check(rt, cookie);
2080         else
2081                 return rt6_check(rt, cookie);
2082 }
2083
2084 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2085 {
2086         struct rt6_info *rt = (struct rt6_info *) dst;
2087
2088         if (rt) {
2089                 if (rt->rt6i_flags & RTF_CACHE) {
2090                         if (rt6_check_expired(rt)) {
2091                                 ip6_del_rt(rt);
2092                                 dst = NULL;
2093                         }
2094                 } else {
2095                         dst_release(dst);
2096                         dst = NULL;
2097                 }
2098         }
2099         return dst;
2100 }
2101
2102 static void ip6_link_failure(struct sk_buff *skb)
2103 {
2104         struct rt6_info *rt;
2105
2106         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2107
2108         rt = (struct rt6_info *) skb_dst(skb);
2109         if (rt) {
2110                 if (rt->rt6i_flags & RTF_CACHE) {
2111                         if (dst_hold_safe(&rt->dst))
2112                                 ip6_del_rt(rt);
2113                 } else {
2114                         struct fib6_node *fn;
2115
2116                         rcu_read_lock();
2117                         fn = rcu_dereference(rt->rt6i_node);
2118                         if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2119                                 fn->fn_sernum = -1;
2120                         rcu_read_unlock();
2121                 }
2122         }
2123 }
2124
2125 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2126 {
2127         struct net *net = dev_net(rt->dst.dev);
2128
2129         rt->rt6i_flags |= RTF_MODIFIED;
2130         rt->rt6i_pmtu = mtu;
2131         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2132 }
2133
2134 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2135 {
2136         return !(rt->rt6i_flags & RTF_CACHE) &&
2137                 (rt->rt6i_flags & RTF_PCPU ||
2138                  rcu_access_pointer(rt->rt6i_node));
2139 }
2140
2141 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2142                                  const struct ipv6hdr *iph, u32 mtu)
2143 {
2144         const struct in6_addr *daddr, *saddr;
2145         struct rt6_info *rt6 = (struct rt6_info *)dst;
2146
2147         if (rt6->rt6i_flags & RTF_LOCAL)
2148                 return;
2149
2150         if (dst_metric_locked(dst, RTAX_MTU))
2151                 return;
2152
2153         if (iph) {
2154                 daddr = &iph->daddr;
2155                 saddr = &iph->saddr;
2156         } else if (sk) {
2157                 daddr = &sk->sk_v6_daddr;
2158                 saddr = &inet6_sk(sk)->saddr;
2159         } else {
2160                 daddr = NULL;
2161                 saddr = NULL;
2162         }
2163         dst_confirm_neigh(dst, daddr);
2164         mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2165         if (mtu >= dst_mtu(dst))
2166                 return;
2167
2168         if (!rt6_cache_allowed_for_pmtu(rt6)) {
2169                 rt6_do_update_pmtu(rt6, mtu);
2170                 /* update rt6_ex->stamp for cache */
2171                 if (rt6->rt6i_flags & RTF_CACHE)
2172                         rt6_update_exception_stamp_rt(rt6);
2173         } else if (daddr) {
2174                 struct rt6_info *nrt6;
2175
2176                 nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
2177                 if (nrt6) {
2178                         rt6_do_update_pmtu(nrt6, mtu);
2179                         if (rt6_insert_exception(nrt6, rt6))
2180                                 dst_release_immediate(&nrt6->dst);
2181                 }
2182         }
2183 }
2184
2185 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2186                                struct sk_buff *skb, u32 mtu)
2187 {
2188         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2189 }
2190
2191 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2192                      int oif, u32 mark, kuid_t uid)
2193 {
2194         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2195         struct dst_entry *dst;
2196         struct flowi6 fl6;
2197
2198         memset(&fl6, 0, sizeof(fl6));
2199         fl6.flowi6_oif = oif;
2200         fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2201         fl6.daddr = iph->daddr;
2202         fl6.saddr = iph->saddr;
2203         fl6.flowlabel = ip6_flowinfo(iph);
2204         fl6.flowi6_uid = uid;
2205
2206         dst = ip6_route_output(net, NULL, &fl6);
2207         if (!dst->error)
2208                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2209         dst_release(dst);
2210 }
2211 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2212
2213 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2214 {
2215         struct dst_entry *dst;
2216
2217         ip6_update_pmtu(skb, sock_net(sk), mtu,
2218                         sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
2219
2220         dst = __sk_dst_get(sk);
2221         if (!dst || !dst->obsolete ||
2222             dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2223                 return;
2224
2225         bh_lock_sock(sk);
2226         if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2227                 ip6_datagram_dst_update(sk, false);
2228         bh_unlock_sock(sk);
2229 }
2230 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2231
2232 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
2233                            const struct flowi6 *fl6)
2234 {
2235 #ifdef CONFIG_IPV6_SUBTREES
2236         struct ipv6_pinfo *np = inet6_sk(sk);
2237 #endif
2238
2239         ip6_dst_store(sk, dst,
2240                       ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
2241                       &sk->sk_v6_daddr : NULL,
2242 #ifdef CONFIG_IPV6_SUBTREES
2243                       ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
2244                       &np->saddr :
2245 #endif
2246                       NULL);
2247 }
2248
2249 /* Handle redirects */
2250 struct ip6rd_flowi {
2251         struct flowi6 fl6;
2252         struct in6_addr gateway;
2253 };
2254
2255 static struct rt6_info *__ip6_route_redirect(struct net *net,
2256                                              struct fib6_table *table,
2257                                              struct flowi6 *fl6,
2258                                              const struct sk_buff *skb,
2259                                              int flags)
2260 {
2261         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2262         struct rt6_info *rt, *rt_cache;
2263         struct fib6_node *fn;
2264
2265         /* Get the "current" route for this destination and
2266          * check if the redirect has come from appropriate router.
2267          *
2268          * RFC 4861 specifies that redirects should only be
2269          * accepted if they come from the nexthop to the target.
2270          * Due to the way the routes are chosen, this notion
2271          * is a bit fuzzy and one might need to check all possible
2272          * routes.
2273          */
2274
2275         rcu_read_lock();
2276         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2277 restart:
2278         for_each_fib6_node_rt_rcu(fn) {
2279                 if (rt->rt6i_nh_flags & RTNH_F_DEAD)
2280                         continue;
2281                 if (rt6_check_expired(rt))
2282                         continue;
2283                 if (rt->dst.error)
2284                         break;
2285                 if (!(rt->rt6i_flags & RTF_GATEWAY))
2286                         continue;
2287                 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
2288                         continue;
2289                 /* rt_cache's gateway might be different from its 'parent'
2290                  * in the case of an ip redirect.
2291                  * So we keep searching in the exception table if the gateway
2292                  * is different.
2293                  */
2294                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) {
2295                         rt_cache = rt6_find_cached_rt(rt,
2296                                                       &fl6->daddr,
2297                                                       &fl6->saddr);
2298                         if (rt_cache &&
2299                             ipv6_addr_equal(&rdfl->gateway,
2300                                             &rt_cache->rt6i_gateway)) {
2301                                 rt = rt_cache;
2302                                 break;
2303                         }
2304                         continue;
2305                 }
2306                 break;
2307         }
2308
2309         if (!rt)
2310                 rt = net->ipv6.ip6_null_entry;
2311         else if (rt->dst.error) {
2312                 rt = net->ipv6.ip6_null_entry;
2313                 goto out;
2314         }
2315
2316         if (rt == net->ipv6.ip6_null_entry) {
2317                 fn = fib6_backtrack(fn, &fl6->saddr);
2318                 if (fn)
2319                         goto restart;
2320         }
2321
2322 out:
2323         ip6_hold_safe(net, &rt, true);
2324
2325         rcu_read_unlock();
2326
2327         trace_fib6_table_lookup(net, rt, table, fl6);
2328         return rt;
2329 };
2330
2331 static struct dst_entry *ip6_route_redirect(struct net *net,
2332                                             const struct flowi6 *fl6,
2333                                             const struct sk_buff *skb,
2334                                             const struct in6_addr *gateway)
2335 {
2336         int flags = RT6_LOOKUP_F_HAS_SADDR;
2337         struct ip6rd_flowi rdfl;
2338
2339         rdfl.fl6 = *fl6;
2340         rdfl.gateway = *gateway;
2341
2342         return fib6_rule_lookup(net, &rdfl.fl6, skb,
2343                                 flags, __ip6_route_redirect);
2344 }
2345
2346 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2347                   kuid_t uid)
2348 {
2349         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2350         struct dst_entry *dst;
2351         struct flowi6 fl6;
2352
2353         memset(&fl6, 0, sizeof(fl6));
2354         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2355         fl6.flowi6_oif = oif;
2356         fl6.flowi6_mark = mark;
2357         fl6.daddr = iph->daddr;
2358         fl6.saddr = iph->saddr;
2359         fl6.flowlabel = ip6_flowinfo(iph);
2360         fl6.flowi6_uid = uid;
2361
2362         dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2363         rt6_do_redirect(dst, NULL, skb);
2364         dst_release(dst);
2365 }
2366 EXPORT_SYMBOL_GPL(ip6_redirect);
2367
2368 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2369                             u32 mark)
2370 {
2371         const struct ipv6hdr *iph = ipv6_hdr(skb);
2372         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2373         struct dst_entry *dst;
2374         struct flowi6 fl6;
2375
2376         memset(&fl6, 0, sizeof(fl6));
2377         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2378         fl6.flowi6_oif = oif;
2379         fl6.flowi6_mark = mark;
2380         fl6.daddr = msg->dest;
2381         fl6.saddr = iph->daddr;
2382         fl6.flowi6_uid = sock_net_uid(net, NULL);
2383
2384         dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2385         rt6_do_redirect(dst, NULL, skb);
2386         dst_release(dst);
2387 }
2388
2389 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2390 {
2391         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2392                      sk->sk_uid);
2393 }
2394 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2395
2396 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2397 {
2398         struct net_device *dev = dst->dev;
2399         unsigned int mtu = dst_mtu(dst);
2400         struct net *net = dev_net(dev);
2401
2402         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2403
2404         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2405                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2406
2407         /*
2408          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2409          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2410          * IPV6_MAXPLEN is also valid and means: "any MSS,
2411          * rely only on pmtu discovery"
2412          */
2413         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2414                 mtu = IPV6_MAXPLEN;
2415         return mtu;
2416 }
2417
2418 static unsigned int ip6_mtu(const struct dst_entry *dst)
2419 {
2420         const struct rt6_info *rt = (const struct rt6_info *)dst;
2421         unsigned int mtu = rt->rt6i_pmtu;
2422         struct inet6_dev *idev;
2423
2424         if (mtu)
2425                 goto out;
2426
2427         mtu = dst_metric_raw(dst, RTAX_MTU);
2428         if (mtu)
2429                 goto out;
2430
2431         mtu = IPV6_MIN_MTU;
2432
2433         rcu_read_lock();
2434         idev = __in6_dev_get(dst->dev);
2435         if (idev)
2436                 mtu = idev->cnf.mtu6;
2437         rcu_read_unlock();
2438
2439 out:
2440         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2441
2442         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2443 }
2444
2445 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2446                                   struct flowi6 *fl6)
2447 {
2448         struct dst_entry *dst;
2449         struct rt6_info *rt;
2450         struct inet6_dev *idev = in6_dev_get(dev);
2451         struct net *net = dev_net(dev);
2452
2453         if (unlikely(!idev))
2454                 return ERR_PTR(-ENODEV);
2455
2456         rt = ip6_dst_alloc(net, dev, 0);
2457         if (unlikely(!rt)) {
2458                 in6_dev_put(idev);
2459                 dst = ERR_PTR(-ENOMEM);
2460                 goto out;
2461         }
2462
2463         rt->dst.flags |= DST_HOST;
2464         rt->dst.input = ip6_input;
2465         rt->dst.output  = ip6_output;
2466         rt->rt6i_gateway  = fl6->daddr;
2467         rt->rt6i_dst.addr = fl6->daddr;
2468         rt->rt6i_dst.plen = 128;
2469         rt->rt6i_idev     = idev;
2470         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2471
2472         /* Add this dst into uncached_list so that rt6_disable_ip() can
2473          * do proper release of the net_device
2474          */
2475         rt6_uncached_list_add(rt);
2476         atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2477
2478         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2479
2480 out:
2481         return dst;
2482 }
2483
2484 static int ip6_dst_gc(struct dst_ops *ops)
2485 {
2486         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2487         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2488         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2489         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2490         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2491         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2492         int entries;
2493
2494         entries = dst_entries_get_fast(ops);
2495         if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2496             entries <= rt_max_size)
2497                 goto out;
2498
2499         net->ipv6.ip6_rt_gc_expire++;
2500         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2501         entries = dst_entries_get_slow(ops);
2502         if (entries < ops->gc_thresh)
2503                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2504 out:
2505         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2506         return entries > rt_max_size;
2507 }
2508
2509 static int ip6_convert_metrics(struct mx6_config *mxc,
2510                                const struct fib6_config *cfg)
2511 {
2512         struct net *net = cfg->fc_nlinfo.nl_net;
2513         bool ecn_ca = false;
2514         struct nlattr *nla;
2515         int remaining;
2516         u32 *mp;
2517
2518         if (!cfg->fc_mx)
2519                 return 0;
2520
2521         mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
2522         if (unlikely(!mp))
2523                 return -ENOMEM;
2524
2525         nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
2526                 int type = nla_type(nla);
2527                 u32 val;
2528
2529                 if (!type)
2530                         continue;
2531                 if (unlikely(type > RTAX_MAX))
2532                         goto err;
2533
2534                 if (type == RTAX_CC_ALGO) {
2535                         char tmp[TCP_CA_NAME_MAX];
2536
2537                         nla_strlcpy(tmp, nla, sizeof(tmp));
2538                         val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca);
2539                         if (val == TCP_CA_UNSPEC)
2540                                 goto err;
2541                 } else {
2542                         val = nla_get_u32(nla);
2543                 }
2544                 if (type == RTAX_HOPLIMIT && val > 255)
2545                         val = 255;
2546                 if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
2547                         goto err;
2548
2549                 mp[type - 1] = val;
2550                 __set_bit(type - 1, mxc->mx_valid);
2551         }
2552
2553         if (ecn_ca) {
2554                 __set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
2555                 mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
2556         }
2557
2558         mxc->mx = mp;
2559         return 0;
2560  err:
2561         kfree(mp);
2562         return -EINVAL;
2563 }
2564
2565 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2566                                             struct fib6_config *cfg,
2567                                             const struct in6_addr *gw_addr,
2568                                             u32 tbid, int flags)
2569 {
2570         struct flowi6 fl6 = {
2571                 .flowi6_oif = cfg->fc_ifindex,
2572                 .daddr = *gw_addr,
2573                 .saddr = cfg->fc_prefsrc,
2574         };
2575         struct fib6_table *table;
2576         struct rt6_info *rt;
2577
2578         table = fib6_get_table(net, tbid);
2579         if (!table)
2580                 return NULL;
2581
2582         if (!ipv6_addr_any(&cfg->fc_prefsrc))
2583                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2584
2585         flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2586         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2587
2588         /* if table lookup failed, fall back to full lookup */
2589         if (rt == net->ipv6.ip6_null_entry) {
2590                 ip6_rt_put(rt);
2591                 rt = NULL;
2592         }
2593
2594         return rt;
2595 }
2596
2597 static int ip6_route_check_nh_onlink(struct net *net,
2598                                      struct fib6_config *cfg,
2599                                      const struct net_device *dev,
2600                                      struct netlink_ext_ack *extack)
2601 {
2602         u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2603         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2604         u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2605         struct rt6_info *grt;
2606         int err;
2607
2608         err = 0;
2609         grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2610         if (grt) {
2611                 if (!grt->dst.error &&
2612                     (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2613                         NL_SET_ERR_MSG(extack,
2614                                        "Nexthop has invalid gateway or device mismatch");
2615                         err = -EINVAL;
2616                 }
2617
2618                 ip6_rt_put(grt);
2619         }
2620
2621         return err;
2622 }
2623
2624 static int ip6_route_check_nh(struct net *net,
2625                               struct fib6_config *cfg,
2626                               struct net_device **_dev,
2627                               struct inet6_dev **idev)
2628 {
2629         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2630         struct net_device *dev = _dev ? *_dev : NULL;
2631         struct rt6_info *grt = NULL;
2632         int err = -EHOSTUNREACH;
2633
2634         if (cfg->fc_table) {
2635                 int flags = RT6_LOOKUP_F_IFACE;
2636
2637                 grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2638                                           cfg->fc_table, flags);
2639                 if (grt) {
2640                         if (grt->rt6i_flags & RTF_GATEWAY ||
2641                             (dev && dev != grt->dst.dev)) {
2642                                 ip6_rt_put(grt);
2643                                 grt = NULL;
2644                         }
2645                 }
2646         }
2647
2648         if (!grt)
2649                 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2650
2651         if (!grt)
2652                 goto out;
2653
2654         if (dev) {
2655                 if (dev != grt->dst.dev) {
2656                         ip6_rt_put(grt);
2657                         goto out;
2658                 }
2659         } else {
2660                 *_dev = dev = grt->dst.dev;
2661                 *idev = grt->rt6i_idev;
2662                 dev_hold(dev);
2663                 in6_dev_hold(grt->rt6i_idev);
2664         }
2665
2666         if (!(grt->rt6i_flags & RTF_GATEWAY))
2667                 err = 0;
2668
2669         ip6_rt_put(grt);
2670
2671 out:
2672         return err;
2673 }
2674
2675 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2676                            struct net_device **_dev, struct inet6_dev **idev,
2677                            struct netlink_ext_ack *extack)
2678 {
2679         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2680         int gwa_type = ipv6_addr_type(gw_addr);
2681         bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
2682         const struct net_device *dev = *_dev;
2683         bool need_addr_check = !dev;
2684         int err = -EINVAL;
2685
2686         /* if gw_addr is local we will fail to detect this in case
2687          * address is still TENTATIVE (DAD in progress). rt6_lookup()
2688          * will return already-added prefix route via interface that
2689          * prefix route was assigned to, which might be non-loopback.
2690          */
2691         if (dev &&
2692             ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2693                 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2694                 goto out;
2695         }
2696
2697         if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2698                 /* IPv6 strictly inhibits using not link-local
2699                  * addresses as nexthop address.
2700                  * Otherwise, router will not able to send redirects.
2701                  * It is very good, but in some (rare!) circumstances
2702                  * (SIT, PtP, NBMA NOARP links) it is handy to allow
2703                  * some exceptions. --ANK
2704                  * We allow IPv4-mapped nexthops to support RFC4798-type
2705                  * addressing
2706                  */
2707                 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2708                         NL_SET_ERR_MSG(extack, "Invalid gateway address");
2709                         goto out;
2710                 }
2711
2712                 if (cfg->fc_flags & RTNH_F_ONLINK)
2713                         err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2714                 else
2715                         err = ip6_route_check_nh(net, cfg, _dev, idev);
2716
2717                 if (err)
2718                         goto out;
2719         }
2720
2721         /* reload in case device was changed */
2722         dev = *_dev;
2723
2724         err = -EINVAL;
2725         if (!dev) {
2726                 NL_SET_ERR_MSG(extack, "Egress device not specified");
2727                 goto out;
2728         } else if (dev->flags & IFF_LOOPBACK) {
2729                 NL_SET_ERR_MSG(extack,
2730                                "Egress device can not be loopback device for this route");
2731                 goto out;
2732         }
2733
2734         /* if we did not check gw_addr above, do so now that the
2735          * egress device has been resolved.
2736          */
2737         if (need_addr_check &&
2738             ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2739                 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2740                 goto out;
2741         }
2742
2743         err = 0;
2744 out:
2745         return err;
2746 }
2747
/*
 * Validate @cfg and build the rt6_info it describes, without inserting it
 * into a FIB table.
 *
 * On success the returned route owns one reference each on the egress
 * device (rt->dst.dev) and its inet6_dev (rt->rt6i_idev), and rt->rt6i_table
 * points at the table it is meant for.  On failure ERR_PTR(-errno) is
 * returned after dropping any device, inet6_dev and route references taken
 * here; @extack carries a message for most validation failures.
 *
 * Side effects on @cfg: fc_metric defaults to IP6_RT_PRIO_USER when zero,
 * fc_protocol defaults to RTPROT_BOOT when unspecified, and on success
 * fc_nlinfo.nl_net is overwritten with the egress device's namespace.
 */
2748 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
2749                                               struct netlink_ext_ack *extack)
2750 {
2751         struct net *net = cfg->fc_nlinfo.nl_net;
2752         struct rt6_info *rt = NULL;
2753         struct net_device *dev = NULL;
2754         struct inet6_dev *idev = NULL;
2755         struct fib6_table *table;
2756         int addr_type;
2757         int err = -EINVAL;
2758
2759         /* RTF_PCPU is an internal flag; can not be set by userspace */
2760         if (cfg->fc_flags & RTF_PCPU) {
2761                 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2762                 goto out;
2763         }
2764
2765         /* RTF_CACHE is an internal flag; can not be set by userspace */
2766         if (cfg->fc_flags & RTF_CACHE) {
2767                 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
2768                 goto out;
2769         }
2770
2771         if (cfg->fc_dst_len > 128) {
2772                 NL_SET_ERR_MSG(extack, "Invalid prefix length");
2773                 goto out;
2774         }
2775         if (cfg->fc_src_len > 128) {
2776                 NL_SET_ERR_MSG(extack, "Invalid source address length");
2777                 goto out;
2778         }
2779 #ifndef CONFIG_IPV6_SUBTREES
2780         if (cfg->fc_src_len) {
2781                 NL_SET_ERR_MSG(extack,
2782                                "Specifying source address requires IPV6_SUBTREES to be enabled");
2783                 goto out;
2784         }
2785 #endif
        /* An explicit ifindex takes a reference on both the device and its
         * inet6_dev; from here on the out: label releases whatever is held.
         */
2786         if (cfg->fc_ifindex) {
2787                 err = -ENODEV;
2788                 dev = dev_get_by_index(net, cfg->fc_ifindex);
2789                 if (!dev)
2790                         goto out;
2791                 idev = in6_dev_get(dev);
2792                 if (!idev)
2793                         goto out;
2794         }
2795
2796         if (cfg->fc_metric == 0)
2797                 cfg->fc_metric = IP6_RT_PRIO_USER;
2798
2799         if (cfg->fc_flags & RTNH_F_ONLINK) {
2800                 if (!dev) {
2801                         NL_SET_ERR_MSG(extack,
2802                                        "Nexthop device required for onlink");
2803                         err = -ENODEV;
2804                         goto out;
2805                 }
2806
2807                 if (!(dev->flags & IFF_UP)) {
2808                         NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2809                         err = -ENETDOWN;
2810                         goto out;
2811                 }
2812         }
2813
        /* A netlink request without NLM_F_CREATE only triggers a warning
         * when the table is missing; the table is created in either case.
         */
2814         err = -ENOBUFS;
2815         if (cfg->fc_nlinfo.nlh &&
2816             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
2817                 table = fib6_get_table(net, cfg->fc_table);
2818                 if (!table) {
2819                         pr_warn("NLM_F_CREATE should be specified when creating new route\n");
2820                         table = fib6_new_table(net, cfg->fc_table);
2821                 }
2822         } else {
2823                 table = fib6_new_table(net, cfg->fc_table);
2824         }
2825
2826         if (!table)
2827                 goto out;
2828
        /* The dst is allocated without a device; dev/idev are attached at
         * install_route once they are final.
         */
2829         rt = ip6_dst_alloc(net, NULL,
2830                            (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
2831
2832         if (!rt) {
2833                 err = -ENOMEM;
2834                 goto out;
2835         }
2836
2837         if (cfg->fc_flags & RTF_EXPIRES)
2838                 rt6_set_expires(rt, jiffies +
2839                                 clock_t_to_jiffies(cfg->fc_expires));
2840         else
2841                 rt6_clean_expires(rt);
2842
2843         if (cfg->fc_protocol == RTPROT_UNSPEC)
2844                 cfg->fc_protocol = RTPROT_BOOT;
2845         rt->rt6i_protocol = cfg->fc_protocol;
2846
2847         addr_type = ipv6_addr_type(&cfg->fc_dst);
2848
        /* choose the input handler from the destination type and flags */
2849         if (addr_type & IPV6_ADDR_MULTICAST)
2850                 rt->dst.input = ip6_mc_input;
2851         else if (cfg->fc_flags & RTF_LOCAL)
2852                 rt->dst.input = ip6_input;
2853         else
2854                 rt->dst.input = ip6_forward;
2855
2856         rt->dst.output = ip6_output;
2857
2858         if (cfg->fc_encap) {
2859                 struct lwtunnel_state *lwtstate;
2860
2861                 err = lwtunnel_build_state(cfg->fc_encap_type,
2862                                            cfg->fc_encap, AF_INET6, cfg,
2863                                            &lwtstate, extack);
2864                 if (err)
2865                         goto out;
2866                 rt->dst.lwtstate = lwtstate_get(lwtstate);
2867                 lwtunnel_set_redirect(&rt->dst);
2868         }
2869
2870         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
2871         rt->rt6i_dst.plen = cfg->fc_dst_len;
2872         if (rt->rt6i_dst.plen == 128)
2873                 rt->dst.flags |= DST_HOST;
2874
2875 #ifdef CONFIG_IPV6_SUBTREES
2876         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
2877         rt->rt6i_src.plen = cfg->fc_src_len;
2878 #endif
2879
2880         rt->rt6i_metric = cfg->fc_metric;
2881         rt->rt6i_nh_weight = 1;
2882
2883         /* We cannot add true routes via loopback here,
2884            they would result in kernel looping; promote them to reject routes
2885          */
2886         if ((cfg->fc_flags & RTF_REJECT) ||
2887             (dev && (dev->flags & IFF_LOOPBACK) &&
2888              !(addr_type & IPV6_ADDR_LOOPBACK) &&
2889              !(cfg->fc_flags & RTF_LOCAL))) {
2890                 /* hold loopback dev/idev if we haven't done so. */
2891                 if (dev != net->loopback_dev) {
2892                         if (dev) {
2893                                 dev_put(dev);
2894                                 in6_dev_put(idev);
2895                         }
2896                         dev = net->loopback_dev;
2897                         dev_hold(dev);
2898                         idev = in6_dev_get(dev);
2899                         if (!idev) {
2900                                 err = -ENODEV;
2901                                 goto out;
2902                         }
2903                 }
2904                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
                /* the route type selects the error code and the handlers */
2905                 switch (cfg->fc_type) {
2906                 case RTN_BLACKHOLE:
2907                         rt->dst.error = -EINVAL;
2908                         rt->dst.output = dst_discard_out;
2909                         rt->dst.input = dst_discard;
2910                         break;
2911                 case RTN_PROHIBIT:
2912                         rt->dst.error = -EACCES;
2913                         rt->dst.output = ip6_pkt_prohibit_out;
2914                         rt->dst.input = ip6_pkt_prohibit;
2915                         break;
2916                 case RTN_THROW:
2917                 case RTN_UNREACHABLE:
2918                 default:
2919                         rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
2920                                         : (cfg->fc_type == RTN_UNREACHABLE)
2921                                         ? -EHOSTUNREACH : -ENETUNREACH;
2922                         rt->dst.output = ip6_pkt_discard_out;
2923                         rt->dst.input = ip6_pkt_discard;
2924                         break;
2925                 }
2926                 goto install_route;
2927         }
2928
        /* gateway validation may fill in dev/idev when none was given */
2929         if (cfg->fc_flags & RTF_GATEWAY) {
2930                 err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
2931                 if (err)
2932                         goto out;
2933
2934                 rt->rt6i_gateway = cfg->fc_gateway;
2935         }
2936
2937         err = -ENODEV;
2938         if (!dev)
2939                 goto out;
2940
2941         if (idev->cnf.disable_ipv6) {
2942                 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
2943                 err = -EACCES;
2944                 goto out;
2945         }
2946
2947         if (!(dev->flags & IFF_UP)) {
2948                 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2949                 err = -ENETDOWN;
2950                 goto out;
2951         }
2952
2953         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
2954                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
2955                         NL_SET_ERR_MSG(extack, "Invalid source address");
2956                         err = -EINVAL;
2957                         goto out;
2958                 }
2959                 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
2960                 rt->rt6i_prefsrc.plen = 128;
2961         } else
2962                 rt->rt6i_prefsrc.plen = 0;
2963
2964         rt->rt6i_flags = cfg->fc_flags;
2965
        /* Success: the dev/idev references held above are handed over to
         * the route here.
         */
2966 install_route:
2967         if (!(rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
2968             !netif_carrier_ok(dev))
2969                 rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
2970         rt->rt6i_nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
2971         rt->dst.dev = dev;
2972         rt->rt6i_idev = idev;
2973         rt->rt6i_table = table;
2974
2975         cfg->fc_nlinfo.nl_net = dev_net(dev);
2976
2977         return rt;
        /* Failure: the route never had dev/idev attached, so all three
         * references are released separately.
         */
2978 out:
2979         if (dev)
2980                 dev_put(dev);
2981         if (idev)
2982                 in6_dev_put(idev);
2983         if (rt)
2984                 dst_release_immediate(&rt->dst);
2985
2986         return ERR_PTR(err);
2987 }
2988
2989 int ip6_route_add(struct fib6_config *cfg,
2990                   struct netlink_ext_ack *extack)
2991 {
2992         struct mx6_config mxc = { .mx = NULL, };
2993         struct rt6_info *rt;
2994         int err;
2995
2996         rt = ip6_route_info_create(cfg, extack);
2997         if (IS_ERR(rt)) {
2998                 err = PTR_ERR(rt);
2999                 rt = NULL;
3000                 goto out;
3001         }
3002
3003         err = ip6_convert_metrics(&mxc, cfg);
3004         if (err)
3005                 goto out;
3006
3007         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack);
3008
3009         kfree(mxc.mx);
3010
3011         return err;
3012 out:
3013         if (rt)
3014                 dst_release_immediate(&rt->dst);
3015
3016         return err;
3017 }
3018
3019 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
3020 {
3021         int err;
3022         struct fib6_table *table;
3023         struct net *net = dev_net(rt->dst.dev);
3024
3025         if (rt == net->ipv6.ip6_null_entry) {
3026                 err = -ENOENT;
3027                 goto out;
3028         }
3029
3030         table = rt->rt6i_table;
3031         spin_lock_bh(&table->tb6_lock);
3032         err = fib6_del(rt, info);
3033         spin_unlock_bh(&table->tb6_lock);
3034
3035 out:
3036         ip6_rt_put(rt);
3037         return err;
3038 }
3039
3040 int ip6_del_rt(struct rt6_info *rt)
3041 {
3042         struct nl_info info = {
3043                 .nl_net = dev_net(rt->dst.dev),
3044         };
3045         return __ip6_del_rt(rt, &info);
3046 }
3047
3048 static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
3049 {
3050         struct nl_info *info = &cfg->fc_nlinfo;
3051         struct net *net = info->nl_net;
3052         struct sk_buff *skb = NULL;
3053         struct fib6_table *table;
3054         int err = -ENOENT;
3055
3056         if (rt == net->ipv6.ip6_null_entry)
3057                 goto out_put;
3058         table = rt->rt6i_table;
3059         spin_lock_bh(&table->tb6_lock);
3060
3061         if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
3062                 struct rt6_info *sibling, *next_sibling;
3063
3064                 /* prefer to send a single notification with all hops */
3065                 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3066                 if (skb) {
3067                         u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3068
3069                         if (rt6_fill_node(net, skb, rt,
3070                                           NULL, NULL, 0, RTM_DELROUTE,
3071                                           info->portid, seq, 0) < 0) {
3072                                 kfree_skb(skb);
3073                                 skb = NULL;
3074                         } else
3075                                 info->skip_notify = 1;
3076                 }
3077
3078                 list_for_each_entry_safe(sibling, next_sibling,
3079                                          &rt->rt6i_siblings,
3080                                          rt6i_siblings) {
3081                         err = fib6_del(sibling, info);
3082                         if (err)
3083                                 goto out_unlock;
3084                 }
3085         }
3086
3087         err = fib6_del(rt, info);
3088 out_unlock:
3089         spin_unlock_bh(&table->tb6_lock);
3090 out_put:
3091         ip6_rt_put(rt);
3092
3093         if (skb) {
3094                 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3095                             info->nlh, gfp_any());
3096         }
3097         return err;
3098 }
3099
3100 static int ip6_route_del(struct fib6_config *cfg,
3101                          struct netlink_ext_ack *extack)
3102 {
3103         struct rt6_info *rt, *rt_cache;
3104         struct fib6_table *table;
3105         struct fib6_node *fn;
3106         int err = -ESRCH;
3107
3108         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3109         if (!table) {
3110                 NL_SET_ERR_MSG(extack, "FIB table does not exist");
3111                 return err;
3112         }
3113
3114         rcu_read_lock();
3115
3116         fn = fib6_locate(&table->tb6_root,
3117                          &cfg->fc_dst, cfg->fc_dst_len,
3118                          &cfg->fc_src, cfg->fc_src_len,
3119                          !(cfg->fc_flags & RTF_CACHE));
3120
3121         if (fn) {
3122                 for_each_fib6_node_rt_rcu(fn) {
3123                         if (cfg->fc_flags & RTF_CACHE) {
3124                                 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
3125                                                               &cfg->fc_src);
3126                                 if (!rt_cache)
3127                                         continue;
3128                                 rt = rt_cache;
3129                         }
3130                         if (cfg->fc_ifindex &&
3131                             (!rt->dst.dev ||
3132                              rt->dst.dev->ifindex != cfg->fc_ifindex))
3133                                 continue;
3134                         if (cfg->fc_flags & RTF_GATEWAY &&
3135                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3136                                 continue;
3137                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
3138                                 continue;
3139                         if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
3140                                 continue;
3141                         if (!dst_hold_safe(&rt->dst))
3142                                 break;
3143                         rcu_read_unlock();
3144
3145                         /* if gateway was specified only delete the one hop */
3146                         if (cfg->fc_flags & RTF_GATEWAY)
3147                                 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3148
3149                         return __ip6_del_rt_siblings(rt, cfg);
3150                 }
3151         }
3152         rcu_read_unlock();
3153
3154         return err;
3155 }
3156
/*
 * Process the ICMPv6 Redirect carried in @skb for the route @dst.
 *
 * After validating the message (length, non-multicast destination,
 * link-local or on-link target, interface configuration, ND options) the
 * neighbour entry for the target is updated, a cached clone of the route
 * towards msg->dest is created with the target as gateway (or without
 * RTF_GATEWAY for an on-link redirect), inserted as an exception of the
 * original route, and NETEVENT_REDIRECT notifiers are called.
 *
 * @sk is not used by this function.  Invalid redirects are dropped silently
 * apart from rate-limited debug messages.
 */
3157 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3158 {
3159         struct netevent_redirect netevent;
3160         struct rt6_info *rt, *nrt = NULL;
3161         struct ndisc_options ndopts;
3162         struct inet6_dev *in6_dev;
3163         struct neighbour *neigh;
3164         struct rd_msg *msg;
3165         int optlen, on_link;
3166         u8 *lladdr;
3167
        /* bytes left for ND options after the fixed redirect header */
3168         optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3169         optlen -= sizeof(*msg);
3170
3171         if (optlen < 0) {
3172                 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3173                 return;
3174         }
3175
3176         msg = (struct rd_msg *)icmp6_hdr(skb);
3177
3178         if (ipv6_addr_is_multicast(&msg->dest)) {
3179                 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3180                 return;
3181         }
3182
        /* target == destination means the destination itself is on-link;
         * otherwise the target must be a link-local unicast address
         */
3183         on_link = 0;
3184         if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3185                 on_link = 1;
3186         } else if (ipv6_addr_type(&msg->target) !=
3187                    (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3188                 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3189                 return;
3190         }
3191
3192         in6_dev = __in6_dev_get(skb->dev);
3193         if (!in6_dev)
3194                 return;
        /* ignore redirects when forwarding or when accept_redirects is off */
3195         if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3196                 return;
3197
3198         /* RFC2461 8.1:
3199          *      The IP source address of the Redirect MUST be the same as the current
3200          *      first-hop router for the specified ICMP Destination Address.
3201          */
3202
3203         if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3204                 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3205                 return;
3206         }
3207
3208         lladdr = NULL;
3209         if (ndopts.nd_opts_tgt_lladdr) {
3210                 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3211                                              skb->dev);
3212                 if (!lladdr) {
3213                         net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3214                         return;
3215                 }
3216         }
3217
3218         rt = (struct rt6_info *) dst;
3219         if (rt->rt6i_flags & RTF_REJECT) {
3220                 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3221                 return;
3222         }
3223
3224         /* Redirect received -> path was valid.
3225          * Look, redirects are sent only in response to data packets,
3226          * so that this nexthop apparently is reachable. --ANK
3227          */
3228         dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3229
        /* takes a neighbour reference that is released at out: */
3230         neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3231         if (!neigh)
3232                 return;
3233
3234         /*
3235          *      We have finally decided to accept it.
3236          */
3237
3238         ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3239                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
3240                      NEIGH_UPDATE_F_OVERRIDE|
3241                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3242                                      NEIGH_UPDATE_F_ISROUTER)),
3243                      NDISC_REDIRECT, &ndopts);
3244
3245         nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
3246         if (!nrt)
3247                 goto out;
3248
3249         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3250         if (on_link)
3251                 nrt->rt6i_flags &= ~RTF_GATEWAY;
3252
3253         nrt->rt6i_protocol = RTPROT_REDIRECT;
3254         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3255
3256         /* No need to remove rt from the exception table if rt is
3257          * a cached route because rt6_insert_exception() will
3258          * take care of it
3259          */
3260         if (rt6_insert_exception(nrt, rt)) {
3261                 dst_release_immediate(&nrt->dst);
3262                 goto out;
3263         }
3264
3265         netevent.old = &rt->dst;
3266         netevent.new = &nrt->dst;
3267         netevent.daddr = &msg->dest;
3268         netevent.neigh = neigh;
3269         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3270
3271 out:
3272         neigh_release(neigh);
3273 }
3274
3275 /*
3276  *      Misc support functions
3277  */
3278
3279 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
3280 {
3281         BUG_ON(from->from);
3282
3283         rt->rt6i_flags &= ~RTF_EXPIRES;
3284         dst_hold(&from->dst);
3285         rt->from = from;
3286         dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
3287 }
3288
3289 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
3290 {
3291         rt->dst.input = ort->dst.input;
3292         rt->dst.output = ort->dst.output;
3293         rt->rt6i_dst = ort->rt6i_dst;
3294         rt->dst.error = ort->dst.error;
3295         rt->rt6i_idev = ort->rt6i_idev;
3296         if (rt->rt6i_idev)
3297                 in6_dev_hold(rt->rt6i_idev);
3298         rt->dst.lastuse = jiffies;
3299         rt->rt6i_gateway = ort->rt6i_gateway;
3300         rt->rt6i_flags = ort->rt6i_flags;
3301         rt6_set_from(rt, ort);
3302         rt->rt6i_metric = ort->rt6i_metric;
3303 #ifdef CONFIG_IPV6_SUBTREES
3304         rt->rt6i_src = ort->rt6i_src;
3305 #endif
3306         rt->rt6i_prefsrc = ort->rt6i_prefsrc;
3307         rt->rt6i_table = ort->rt6i_table;
3308         rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
3309 }
3310
3311 #ifdef CONFIG_IPV6_ROUTE_INFO
3312 static struct rt6_info *rt6_get_route_info(struct net *net,
3313                                            const struct in6_addr *prefix, int prefixlen,
3314                                            const struct in6_addr *gwaddr,
3315                                            struct net_device *dev)
3316 {
3317         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3318         int ifindex = dev->ifindex;
3319         struct fib6_node *fn;
3320         struct rt6_info *rt = NULL;
3321         struct fib6_table *table;
3322
3323         table = fib6_get_table(net, tb_id);
3324         if (!table)
3325                 return NULL;
3326
3327         rcu_read_lock();
3328         fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3329         if (!fn)
3330                 goto out;
3331
3332         for_each_fib6_node_rt_rcu(fn) {
3333                 if (rt->dst.dev->ifindex != ifindex)
3334                         continue;
3335                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3336                         continue;
3337                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
3338                         continue;
3339                 ip6_hold_safe(NULL, &rt, false);
3340                 break;
3341         }
3342 out:
3343         rcu_read_unlock();
3344         return rt;
3345 }
3346
3347 static struct rt6_info *rt6_add_route_info(struct net *net,
3348                                            const struct in6_addr *prefix, int prefixlen,
3349                                            const struct in6_addr *gwaddr,
3350                                            struct net_device *dev,
3351                                            unsigned int pref)
3352 {
3353         struct fib6_config cfg = {
3354                 .fc_metric      = IP6_RT_PRIO_USER,
3355                 .fc_ifindex     = dev->ifindex,
3356                 .fc_dst_len     = prefixlen,
3357                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3358                                   RTF_UP | RTF_PREF(pref),
3359                 .fc_protocol = RTPROT_RA,
3360                 .fc_nlinfo.portid = 0,
3361                 .fc_nlinfo.nlh = NULL,
3362                 .fc_nlinfo.nl_net = net,
3363         };
3364
3365         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3366         cfg.fc_dst = *prefix;
3367         cfg.fc_gateway = *gwaddr;
3368
3369         /* We should treat it as a default route if prefix length is 0. */
3370         if (!prefixlen)
3371                 cfg.fc_flags |= RTF_DEFAULT;
3372
3373         ip6_route_add(&cfg, NULL);
3374
3375         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3376 }
3377 #endif
3378
3379 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
3380 {
3381         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3382         struct rt6_info *rt;
3383         struct fib6_table *table;
3384
3385         table = fib6_get_table(dev_net(dev), tb_id);
3386         if (!table)
3387                 return NULL;
3388
3389         rcu_read_lock();
3390         for_each_fib6_node_rt_rcu(&table->tb6_root) {
3391                 if (dev == rt->dst.dev &&
3392                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3393                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
3394                         break;
3395         }
3396         if (rt)
3397                 ip6_hold_safe(NULL, &rt, false);
3398         rcu_read_unlock();
3399         return rt;
3400 }
3401
3402 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
3403                                      struct net_device *dev,
3404                                      unsigned int pref)
3405 {
3406         struct fib6_config cfg = {
3407                 .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3408                 .fc_metric      = IP6_RT_PRIO_USER,
3409                 .fc_ifindex     = dev->ifindex,
3410                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3411                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3412                 .fc_protocol = RTPROT_RA,
3413                 .fc_nlinfo.portid = 0,
3414                 .fc_nlinfo.nlh = NULL,
3415                 .fc_nlinfo.nl_net = dev_net(dev),
3416         };
3417
3418         cfg.fc_gateway = *gwaddr;
3419
3420         if (!ip6_route_add(&cfg, NULL)) {
3421                 struct fib6_table *table;
3422
3423                 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3424                 if (table)
3425                         table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3426         }
3427
3428         return rt6_get_dflt_router(gwaddr, dev);
3429 }
3430
3431 static void __rt6_purge_dflt_routers(struct fib6_table *table)
3432 {
3433         struct rt6_info *rt;
3434
3435 restart:
3436         rcu_read_lock();
3437         for_each_fib6_node_rt_rcu(&table->tb6_root) {
3438                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3439                     (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
3440                         if (dst_hold_safe(&rt->dst)) {
3441                                 rcu_read_unlock();
3442                                 ip6_del_rt(rt);
3443                         } else {
3444                                 rcu_read_unlock();
3445                         }
3446                         goto restart;
3447                 }
3448         }
3449         rcu_read_unlock();
3450
3451         table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3452 }
3453
3454 void rt6_purge_dflt_routers(struct net *net)
3455 {
3456         struct fib6_table *table;
3457         struct hlist_head *head;
3458         unsigned int h;
3459
3460         rcu_read_lock();
3461
3462         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3463                 head = &net->ipv6.fib_table_hash[h];
3464                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3465                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3466                                 __rt6_purge_dflt_routers(table);
3467                 }
3468         }
3469
3470         rcu_read_unlock();
3471 }
3472
3473 static void rtmsg_to_fib6_config(struct net *net,
3474                                  struct in6_rtmsg *rtmsg,
3475                                  struct fib6_config *cfg)
3476 {
3477         memset(cfg, 0, sizeof(*cfg));
3478
3479         cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3480                          : RT6_TABLE_MAIN;
3481         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3482         cfg->fc_metric = rtmsg->rtmsg_metric;
3483         cfg->fc_expires = rtmsg->rtmsg_info;
3484         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3485         cfg->fc_src_len = rtmsg->rtmsg_src_len;
3486         cfg->fc_flags = rtmsg->rtmsg_flags;
3487
3488         cfg->fc_nlinfo.nl_net = net;
3489
3490         cfg->fc_dst = rtmsg->rtmsg_dst;
3491         cfg->fc_src = rtmsg->rtmsg_src;
3492         cfg->fc_gateway = rtmsg->rtmsg_gateway;
3493 }
3494
3495 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3496 {
3497         struct fib6_config cfg;
3498         struct in6_rtmsg rtmsg;
3499         int err;
3500
3501         switch (cmd) {
3502         case SIOCADDRT:         /* Add a route */
3503         case SIOCDELRT:         /* Delete a route */
3504                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3505                         return -EPERM;
3506                 err = copy_from_user(&rtmsg, arg,
3507                                      sizeof(struct in6_rtmsg));
3508                 if (err)
3509                         return -EFAULT;
3510
3511                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3512
3513                 rtnl_lock();
3514                 switch (cmd) {
3515                 case SIOCADDRT:
3516                         err = ip6_route_add(&cfg, NULL);
3517                         break;
3518                 case SIOCDELRT:
3519                         err = ip6_route_del(&cfg, NULL);
3520                         break;
3521                 default:
3522                         err = -EINVAL;
3523                 }
3524                 rtnl_unlock();
3525
3526                 return err;
3527         }
3528
3529         return -EINVAL;
3530 }
3531
3532 /*
3533  *      Drop the packet on the floor
3534  */
3535
3536 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3537 {
3538         int type;
3539         struct dst_entry *dst = skb_dst(skb);
3540         switch (ipstats_mib_noroutes) {
3541         case IPSTATS_MIB_INNOROUTES:
3542                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3543                 if (type == IPV6_ADDR_ANY) {
3544                         IP6_INC_STATS(dev_net(dst->dev),
3545                                       __in6_dev_get_safely(skb->dev),
3546                                       IPSTATS_MIB_INADDRERRORS);
3547                         break;
3548                 }
3549                 /* FALLTHROUGH */
3550         case IPSTATS_MIB_OUTNOROUTES:
3551                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3552                               ipstats_mib_noroutes);
3553                 break;
3554         }
3555         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3556         kfree_skb(skb);
3557         return 0;
3558 }
3559
3560 static int ip6_pkt_discard(struct sk_buff *skb)
3561 {
3562         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3563 }
3564
3565 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3566 {
3567         skb->dev = skb_dst(skb)->dev;
3568         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3569 }
3570
3571 static int ip6_pkt_prohibit(struct sk_buff *skb)
3572 {
3573         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3574 }
3575
3576 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3577 {
3578         skb->dev = skb_dst(skb)->dev;
3579         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3580 }
3581
3582 /*
3583  *      Allocate a dst for local (unicast / anycast) address.
3584  */
3585
3586 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
3587                                     const struct in6_addr *addr,
3588                                     bool anycast)
3589 {
3590         u32 tb_id;
3591         struct net *net = dev_net(idev->dev);
3592         struct net_device *dev = idev->dev;
3593         struct rt6_info *rt;
3594
3595         rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
3596         if (!rt)
3597                 return ERR_PTR(-ENOMEM);
3598
3599         in6_dev_hold(idev);
3600
3601         rt->dst.flags |= DST_HOST;
3602         rt->dst.input = ip6_input;
3603         rt->dst.output = ip6_output;
3604         rt->rt6i_idev = idev;
3605
3606         rt->rt6i_protocol = RTPROT_KERNEL;
3607         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
3608         if (anycast)
3609                 rt->rt6i_flags |= RTF_ANYCAST;
3610         else
3611                 rt->rt6i_flags |= RTF_LOCAL;
3612
3613         rt->rt6i_gateway  = *addr;
3614         rt->rt6i_dst.addr = *addr;
3615         rt->rt6i_dst.plen = 128;
3616         tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3617         rt->rt6i_table = fib6_get_table(net, tb_id);
3618
3619         return rt;
3620 }
3621
/* remove deleted ip from prefsrc entries */
/* Argument passed to fib6_remove_prefsrc() through fib6_clean_all(). */
struct arg_dev_net_ip {
	/* only routes on this device are touched; NULL matches any device */
	struct net_device *dev;
	/* namespace whose ip6_null_entry is left alone */
	struct net *net;
	/* address compared with each route's rt6i_prefsrc.addr */
	struct in6_addr *addr;
};
3628
3629 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
3630 {
3631         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3632         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3633         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3634
3635         if (((void *)rt->dst.dev == dev || !dev) &&
3636             rt != net->ipv6.ip6_null_entry &&
3637             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
3638                 spin_lock_bh(&rt6_exception_lock);
3639                 /* remove prefsrc entry */
3640                 rt->rt6i_prefsrc.plen = 0;
3641                 /* need to update cache as well */
3642                 rt6_exceptions_remove_prefsrc(rt);
3643                 spin_unlock_bh(&rt6_exception_lock);
3644         }
3645         return 0;
3646 }
3647
3648 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3649 {
3650         struct net *net = dev_net(ifp->idev->dev);
3651         struct arg_dev_net_ip adni = {
3652                 .dev = ifp->idev->dev,
3653                 .net = net,
3654                 .addr = &ifp->addr,
3655         };
3656         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3657 }
3658
3659 #define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3660
3661 /* Remove routers and update dst entries when gateway turn into host. */
3662 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
3663 {
3664         struct in6_addr *gateway = (struct in6_addr *)arg;
3665
3666         if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3667             ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
3668                 return -1;
3669         }
3670
3671         /* Further clean up cached routes in exception table.
3672          * This is needed because cached route may have a different
3673          * gateway than its 'parent' in the case of an ip redirect.
3674          */
3675         rt6_exceptions_clean_tohost(rt, gateway);
3676
3677         return 0;
3678 }
3679
3680 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3681 {
3682         fib6_clean_all(net, fib6_clean_tohost, gateway);
3683 }
3684
/* Argument handed to the fib6_ifup() and fib6_ifdown() walker callbacks. */
struct arg_netdev_event {
	/* device the walk is about */
	const struct net_device *dev;
	/* Only one member is meaningful per walk: rt6_sync_up() fills
	 * nh_flags (flags to clear), rt6_sync_down_dev() fills event
	 * (the NETDEV_* event).
	 */
	union {
		unsigned int nh_flags;
		unsigned long event;
	};
};
3692
3693 static struct rt6_info *rt6_multipath_first_sibling(const struct rt6_info *rt)
3694 {
3695         struct rt6_info *iter;
3696         struct fib6_node *fn;
3697
3698         fn = rcu_dereference_protected(rt->rt6i_node,
3699                         lockdep_is_held(&rt->rt6i_table->tb6_lock));
3700         iter = rcu_dereference_protected(fn->leaf,
3701                         lockdep_is_held(&rt->rt6i_table->tb6_lock));
3702         while (iter) {
3703                 if (iter->rt6i_metric == rt->rt6i_metric &&
3704                     rt6_qualify_for_ecmp(iter))
3705                         return iter;
3706                 iter = rcu_dereference_protected(iter->rt6_next,
3707                                 lockdep_is_held(&rt->rt6i_table->tb6_lock));
3708         }
3709
3710         return NULL;
3711 }
3712
3713 static bool rt6_is_dead(const struct rt6_info *rt)
3714 {
3715         if (rt->rt6i_nh_flags & RTNH_F_DEAD ||
3716             (rt->rt6i_nh_flags & RTNH_F_LINKDOWN &&
3717              rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
3718                 return true;
3719
3720         return false;
3721 }
3722
3723 static int rt6_multipath_total_weight(const struct rt6_info *rt)
3724 {
3725         struct rt6_info *iter;
3726         int total = 0;
3727
3728         if (!rt6_is_dead(rt))
3729                 total += rt->rt6i_nh_weight;
3730
3731         list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) {
3732                 if (!rt6_is_dead(iter))
3733                         total += iter->rt6i_nh_weight;
3734         }
3735
3736         return total;
3737 }
3738
3739 static void rt6_upper_bound_set(struct rt6_info *rt, int *weight, int total)
3740 {
3741         int upper_bound = -1;
3742
3743         if (!rt6_is_dead(rt)) {
3744                 *weight += rt->rt6i_nh_weight;
3745                 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3746                                                     total) - 1;
3747         }
3748         atomic_set(&rt->rt6i_nh_upper_bound, upper_bound);
3749 }
3750
3751 static void rt6_multipath_upper_bound_set(struct rt6_info *rt, int total)
3752 {
3753         struct rt6_info *iter;
3754         int weight = 0;
3755
3756         rt6_upper_bound_set(rt, &weight, total);
3757
3758         list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3759                 rt6_upper_bound_set(iter, &weight, total);
3760 }
3761
3762 void rt6_multipath_rebalance(struct rt6_info *rt)
3763 {
3764         struct rt6_info *first;
3765         int total;
3766
3767         /* In case the entire multipath route was marked for flushing,
3768          * then there is no need to rebalance upon the removal of every
3769          * sibling route.
3770          */
3771         if (!rt->rt6i_nsiblings || rt->should_flush)
3772                 return;
3773
3774         /* During lookup routes are evaluated in order, so we need to
3775          * make sure upper bounds are assigned from the first sibling
3776          * onwards.
3777          */
3778         first = rt6_multipath_first_sibling(rt);
3779         if (WARN_ON_ONCE(!first))
3780                 return;
3781
3782         total = rt6_multipath_total_weight(first);
3783         rt6_multipath_upper_bound_set(first, total);
3784 }
3785
3786 static int fib6_ifup(struct rt6_info *rt, void *p_arg)
3787 {
3788         const struct arg_netdev_event *arg = p_arg;
3789         struct net *net = dev_net(arg->dev);
3790
3791         if (rt != net->ipv6.ip6_null_entry && rt->dst.dev == arg->dev) {
3792                 rt->rt6i_nh_flags &= ~arg->nh_flags;
3793                 fib6_update_sernum_upto_root(net, rt);
3794                 rt6_multipath_rebalance(rt);
3795         }
3796
3797         return 0;
3798 }
3799
3800 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3801 {
3802         struct arg_netdev_event arg = {
3803                 .dev = dev,
3804                 {
3805                         .nh_flags = nh_flags,
3806                 },
3807         };
3808
3809         if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3810                 arg.nh_flags |= RTNH_F_LINKDOWN;
3811
3812         fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
3813 }
3814
3815 static bool rt6_multipath_uses_dev(const struct rt6_info *rt,
3816                                    const struct net_device *dev)
3817 {
3818         struct rt6_info *iter;
3819
3820         if (rt->dst.dev == dev)
3821                 return true;
3822         list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3823                 if (iter->dst.dev == dev)
3824                         return true;
3825
3826         return false;
3827 }
3828
3829 static void rt6_multipath_flush(struct rt6_info *rt)
3830 {
3831         struct rt6_info *iter;
3832
3833         rt->should_flush = 1;
3834         list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3835                 iter->should_flush = 1;
3836 }
3837
3838 static unsigned int rt6_multipath_dead_count(const struct rt6_info *rt,
3839                                              const struct net_device *down_dev)
3840 {
3841         struct rt6_info *iter;
3842         unsigned int dead = 0;
3843
3844         if (rt->dst.dev == down_dev || rt->rt6i_nh_flags & RTNH_F_DEAD)
3845                 dead++;
3846         list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3847                 if (iter->dst.dev == down_dev ||
3848                     iter->rt6i_nh_flags & RTNH_F_DEAD)
3849                         dead++;
3850
3851         return dead;
3852 }
3853
3854 static void rt6_multipath_nh_flags_set(struct rt6_info *rt,
3855                                        const struct net_device *dev,
3856                                        unsigned int nh_flags)
3857 {
3858         struct rt6_info *iter;
3859
3860         if (rt->dst.dev == dev)
3861                 rt->rt6i_nh_flags |= nh_flags;
3862         list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3863                 if (iter->dst.dev == dev)
3864                         iter->rt6i_nh_flags |= nh_flags;
3865 }
3866
3867 /* called with write lock held for table with rt */
3868 static int fib6_ifdown(struct rt6_info *rt, void *p_arg)
3869 {
3870         const struct arg_netdev_event *arg = p_arg;
3871         const struct net_device *dev = arg->dev;
3872         struct net *net = dev_net(dev);
3873
3874         if (rt == net->ipv6.ip6_null_entry)
3875                 return 0;
3876
3877         switch (arg->event) {
3878         case NETDEV_UNREGISTER:
3879                 return rt->dst.dev == dev ? -1 : 0;
3880         case NETDEV_DOWN:
3881                 if (rt->should_flush)
3882                         return -1;
3883                 if (!rt->rt6i_nsiblings)
3884                         return rt->dst.dev == dev ? -1 : 0;
3885                 if (rt6_multipath_uses_dev(rt, dev)) {
3886                         unsigned int count;
3887
3888                         count = rt6_multipath_dead_count(rt, dev);
3889                         if (rt->rt6i_nsiblings + 1 == count) {
3890                                 rt6_multipath_flush(rt);
3891                                 return -1;
3892                         }
3893                         rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
3894                                                    RTNH_F_LINKDOWN);
3895                         fib6_update_sernum(net, rt);
3896                         rt6_multipath_rebalance(rt);
3897                 }
3898                 return -2;
3899         case NETDEV_CHANGE:
3900                 if (rt->dst.dev != dev ||
3901                     rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST))
3902                         break;
3903                 rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
3904                 rt6_multipath_rebalance(rt);
3905                 break;
3906         }
3907
3908         return 0;
3909 }
3910
3911 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
3912 {
3913         struct arg_netdev_event arg = {
3914                 .dev = dev,
3915                 {
3916                         .event = event,
3917                 },
3918         };
3919
3920         fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
3921 }
3922
3923 void rt6_disable_ip(struct net_device *dev, unsigned long event)
3924 {
3925         rt6_sync_down_dev(dev, event);
3926         rt6_uncached_list_flush_dev(dev_net(dev), dev);
3927         neigh_ifdown(&nd_tbl, dev);
3928 }
3929
/* Argument passed to rt6_mtu_change_route() through fib6_clean_all(). */
struct rt6_mtu_change_arg {
	/* device whose MTU changed */
	struct net_device *dev;
	/* the new MTU value */
	unsigned int mtu;
};
3934
3935 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
3936 {
3937         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
3938         struct inet6_dev *idev;
3939
3940         /* In IPv6 pmtu discovery is not optional,
3941            so that RTAX_MTU lock cannot disable it.
3942            We still use this lock to block changes
3943            caused by addrconf/ndisc.
3944         */
3945
3946         idev = __in6_dev_get(arg->dev);
3947         if (!idev)
3948                 return 0;
3949
3950         /* For administrative MTU increase, there is no way to discover
3951            IPv6 PMTU increase, so PMTU increase should be updated here.
3952            Since RFC 1981 doesn't include administrative MTU increase
3953            update PMTU increase is a MUST. (i.e. jumbo frame)
3954          */
3955         if (rt->dst.dev == arg->dev &&
3956             !dst_metric_locked(&rt->dst, RTAX_MTU)) {
3957                 spin_lock_bh(&rt6_exception_lock);
3958                 if (dst_metric_raw(&rt->dst, RTAX_MTU) &&
3959                     rt6_mtu_change_route_allowed(idev, rt, arg->mtu))
3960                         dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
3961                 rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
3962                 spin_unlock_bh(&rt6_exception_lock);
3963         }
3964         return 0;
3965 }
3966
3967 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
3968 {
3969         struct rt6_mtu_change_arg arg = {
3970                 .dev = dev,
3971                 .mtu = mtu,
3972         };
3973
3974         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
3975 }
3976
3977 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
3978         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
3979         [RTA_OIF]               = { .type = NLA_U32 },
3980         [RTA_IIF]               = { .type = NLA_U32 },
3981         [RTA_PRIORITY]          = { .type = NLA_U32 },
3982         [RTA_METRICS]           = { .type = NLA_NESTED },
3983         [RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
3984         [RTA_PREF]              = { .type = NLA_U8 },
3985         [RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
3986         [RTA_ENCAP]             = { .type = NLA_NESTED },
3987         [RTA_EXPIRES]           = { .type = NLA_U32 },
3988         [RTA_UID]               = { .type = NLA_U32 },
3989         [RTA_MARK]              = { .type = NLA_U32 },
3990 };
3991
3992 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
3993                               struct fib6_config *cfg,
3994                               struct netlink_ext_ack *extack)
3995 {
3996         struct rtmsg *rtm;
3997         struct nlattr *tb[RTA_MAX+1];
3998         unsigned int pref;
3999         int err;
4000
4001         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4002                           NULL);
4003         if (err < 0)
4004                 goto errout;
4005
4006         err = -EINVAL;
4007         rtm = nlmsg_data(nlh);
4008         memset(cfg, 0, sizeof(*cfg));
4009
4010         cfg->fc_table = rtm->rtm_table;
4011         cfg->fc_dst_len = rtm->rtm_dst_len;
4012         cfg->fc_src_len = rtm->rtm_src_len;
4013         cfg->fc_flags = RTF_UP;
4014         cfg->fc_protocol = rtm->rtm_protocol;
4015         cfg->fc_type = rtm->rtm_type;
4016
4017         if (rtm->rtm_type == RTN_UNREACHABLE ||
4018             rtm->rtm_type == RTN_BLACKHOLE ||
4019             rtm->rtm_type == RTN_PROHIBIT ||
4020             rtm->rtm_type == RTN_THROW)
4021                 cfg->fc_flags |= RTF_REJECT;
4022
4023         if (rtm->rtm_type == RTN_LOCAL)
4024                 cfg->fc_flags |= RTF_LOCAL;
4025
4026         if (rtm->rtm_flags & RTM_F_CLONED)
4027                 cfg->fc_flags |= RTF_CACHE;
4028
4029         cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4030
4031         cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
4032         cfg->fc_nlinfo.nlh = nlh;
4033         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
4034
4035         if (tb[RTA_GATEWAY]) {
4036                 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4037                 cfg->fc_flags |= RTF_GATEWAY;
4038         }
4039
4040         if (tb[RTA_DST]) {
4041                 int plen = (rtm->rtm_dst_len + 7) >> 3;
4042
4043                 if (nla_len(tb[RTA_DST]) < plen)
4044                         goto errout;
4045
4046                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4047         }
4048
4049         if (tb[RTA_SRC]) {
4050                 int plen = (rtm->rtm_src_len + 7) >> 3;
4051
4052                 if (nla_len(tb[RTA_SRC]) < plen)
4053                         goto errout;
4054
4055                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4056         }
4057
4058         if (tb[RTA_PREFSRC])
4059                 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4060
4061         if (tb[RTA_OIF])
4062                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4063
4064         if (tb[RTA_PRIORITY])
4065                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4066
4067         if (tb[RTA_METRICS]) {
4068                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4069                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4070         }
4071
4072         if (tb[RTA_TABLE])
4073                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4074
4075         if (tb[RTA_MULTIPATH]) {
4076                 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4077                 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4078
4079                 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4080                                                      cfg->fc_mp_len, extack);
4081                 if (err < 0)
4082                         goto errout;
4083         }
4084
4085         if (tb[RTA_PREF]) {
4086                 pref = nla_get_u8(tb[RTA_PREF]);
4087                 if (pref != ICMPV6_ROUTER_PREF_LOW &&
4088                     pref != ICMPV6_ROUTER_PREF_HIGH)
4089                         pref = ICMPV6_ROUTER_PREF_MEDIUM;
4090                 cfg->fc_flags |= RTF_PREF(pref);
4091         }
4092
4093         if (tb[RTA_ENCAP])
4094                 cfg->fc_encap = tb[RTA_ENCAP];
4095
4096         if (tb[RTA_ENCAP_TYPE]) {
4097                 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4098
4099                 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4100                 if (err < 0)
4101                         goto errout;
4102         }
4103
4104         if (tb[RTA_EXPIRES]) {
4105                 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4106
4107                 if (addrconf_finite_timeout(timeout)) {
4108                         cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4109                         cfg->fc_flags |= RTF_EXPIRES;
4110                 }
4111         }
4112
4113         err = 0;
4114 errout:
4115         return err;
4116 }
4117
/* One nexthop of a multipath request, queued by ip6_route_info_append(). */
struct rt6_nh {
	/* route entry built for this nexthop */
	struct rt6_info *rt6_info;
	/* copy of the per-nexthop configuration */
	struct fib6_config r_cfg;
	/* metrics filled in by ip6_convert_metrics() from the configuration */
	struct mx6_config mxc;
	/* linkage in the caller's nexthop list */
	struct list_head next;
};
4124
4125 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
4126 {
4127         struct rt6_nh *nh;
4128
4129         list_for_each_entry(nh, rt6_nh_list, next) {
4130                 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
4131                         &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
4132                         nh->r_cfg.fc_ifindex);
4133         }
4134 }
4135
4136 static int ip6_route_info_append(struct list_head *rt6_nh_list,
4137                                  struct rt6_info *rt, struct fib6_config *r_cfg)
4138 {
4139         struct rt6_nh *nh;
4140         int err = -EEXIST;
4141
4142         list_for_each_entry(nh, rt6_nh_list, next) {
4143                 /* check if rt6_info already exists */
4144                 if (rt6_duplicate_nexthop(nh->rt6_info, rt))
4145                         return err;
4146         }
4147
4148         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4149         if (!nh)
4150                 return -ENOMEM;
4151         nh->rt6_info = rt;
4152         err = ip6_convert_metrics(&nh->mxc, r_cfg);
4153         if (err) {
4154                 kfree(nh);
4155                 return err;
4156         }
4157         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4158         list_add_tail(&nh->next, rt6_nh_list);
4159
4160         return 0;
4161 }
4162
4163 static void ip6_route_mpath_notify(struct rt6_info *rt,
4164                                    struct rt6_info *rt_last,
4165                                    struct nl_info *info,
4166                                    __u16 nlflags)
4167 {
4168         /* if this is an APPEND route, then rt points to the first route
4169          * inserted and rt_last points to last route inserted. Userspace
4170          * wants a consistent dump of the route which starts at the first
4171          * nexthop. Since sibling routes are always added at the end of
4172          * the list, find the first sibling of the last route appended
4173          */
4174         if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
4175                 rt = list_first_entry(&rt_last->rt6i_siblings,
4176                                       struct rt6_info,
4177                                       rt6i_siblings);
4178         }
4179
4180         if (rt)
4181                 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4182 }
4183
4184 static int ip6_route_multipath_add(struct fib6_config *cfg,
4185                                    struct netlink_ext_ack *extack)
4186 {
4187         struct rt6_info *rt_notif = NULL, *rt_last = NULL;
4188         struct nl_info *info = &cfg->fc_nlinfo;
4189         struct fib6_config r_cfg;
4190         struct rtnexthop *rtnh;
4191         struct rt6_info *rt;
4192         struct rt6_nh *err_nh;
4193         struct rt6_nh *nh, *nh_safe;
4194         __u16 nlflags;
4195         int remaining;
4196         int attrlen;
4197         int err = 1;
4198         int nhn = 0;
4199         int replace = (cfg->fc_nlinfo.nlh &&
4200                        (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4201         LIST_HEAD(rt6_nh_list);
4202
4203         nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4204         if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4205                 nlflags |= NLM_F_APPEND;
4206
4207         remaining = cfg->fc_mp_len;
4208         rtnh = (struct rtnexthop *)cfg->fc_mp;
4209
4210         /* Parse a Multipath Entry and build a list (rt6_nh_list) of
4211          * rt6_info structs per nexthop
4212          */
4213         while (rtnh_ok(rtnh, remaining)) {
4214                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4215                 if (rtnh->rtnh_ifindex)
4216                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4217
4218                 attrlen = rtnh_attrlen(rtnh);
4219                 if (attrlen > 0) {
4220                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4221
4222                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4223                         if (nla) {
4224                                 r_cfg.fc_gateway = nla_get_in6_addr(nla);
4225                                 r_cfg.fc_flags |= RTF_GATEWAY;
4226                         }
4227                         r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4228                         nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4229                         if (nla)
4230                                 r_cfg.fc_encap_type = nla_get_u16(nla);
4231                 }
4232
4233                 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4234                 rt = ip6_route_info_create(&r_cfg, extack);
4235                 if (IS_ERR(rt)) {
4236                         err = PTR_ERR(rt);
4237                         rt = NULL;
4238                         goto cleanup;
4239                 }
4240
4241                 rt->rt6i_nh_weight = rtnh->rtnh_hops + 1;
4242
4243                 err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
4244                 if (err) {
4245                         dst_release_immediate(&rt->dst);
4246                         goto cleanup;
4247                 }
4248
4249                 rtnh = rtnh_next(rtnh, &remaining);
4250         }
4251
4252         /* for add and replace send one notification with all nexthops.
4253          * Skip the notification in fib6_add_rt2node and send one with
4254          * the full route when done
4255          */
4256         info->skip_notify = 1;
4257
4258         err_nh = NULL;
4259         list_for_each_entry(nh, &rt6_nh_list, next) {
4260                 rt_last = nh->rt6_info;
4261                 err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack);
4262                 /* save reference to first route for notification */
4263                 if (!rt_notif && !err)
4264                         rt_notif = nh->rt6_info;
4265
4266                 /* nh->rt6_info is used or freed at this point, reset to NULL*/
4267                 nh->rt6_info = NULL;
4268                 if (err) {
4269                         if (replace && nhn)
4270                                 ip6_print_replace_route_err(&rt6_nh_list);
4271                         err_nh = nh;
4272                         goto add_errout;
4273                 }
4274
4275                 /* Because each route is added like a single route we remove
4276                  * these flags after the first nexthop: if there is a collision,
4277                  * we have already failed to add the first nexthop:
4278                  * fib6_add_rt2node() has rejected it; when replacing, old
4279                  * nexthops have been replaced by first new, the rest should
4280                  * be added to it.
4281                  */
4282                 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4283                                                      NLM_F_REPLACE);
4284                 nhn++;
4285         }
4286
4287         /* success ... tell user about new route */
4288         ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4289         goto cleanup;
4290
4291 add_errout:
4292         /* send notification for routes that were added so that
4293          * the delete notifications sent by ip6_route_del are
4294          * coherent
4295          */
4296         if (rt_notif)
4297                 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4298
4299         /* Delete routes that were already added */
4300         list_for_each_entry(nh, &rt6_nh_list, next) {
4301                 if (err_nh == nh)
4302                         break;
4303                 ip6_route_del(&nh->r_cfg, extack);
4304         }
4305
4306 cleanup:
4307         list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4308                 if (nh->rt6_info)
4309                         dst_release_immediate(&nh->rt6_info->dst);
4310                 kfree(nh->mxc.mx);
4311                 list_del(&nh->next);
4312                 kfree(nh);
4313         }
4314
4315         return err;
4316 }
4317
4318 static int ip6_route_multipath_del(struct fib6_config *cfg,
4319                                    struct netlink_ext_ack *extack)
4320 {
4321         struct fib6_config r_cfg;
4322         struct rtnexthop *rtnh;
4323         int remaining;
4324         int attrlen;
4325         int err = 1, last_err = 0;
4326
4327         remaining = cfg->fc_mp_len;
4328         rtnh = (struct rtnexthop *)cfg->fc_mp;
4329
4330         /* Parse a Multipath Entry */
4331         while (rtnh_ok(rtnh, remaining)) {
4332                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4333                 if (rtnh->rtnh_ifindex)
4334                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4335
4336                 attrlen = rtnh_attrlen(rtnh);
4337                 if (attrlen > 0) {
4338                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4339
4340                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4341                         if (nla) {
4342                                 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4343                                 r_cfg.fc_flags |= RTF_GATEWAY;
4344                         }
4345                 }
4346                 err = ip6_route_del(&r_cfg, extack);
4347                 if (err)
4348                         last_err = err;
4349
4350                 rtnh = rtnh_next(rtnh, &remaining);
4351         }
4352
4353         return last_err;
4354 }
4355
4356 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4357                               struct netlink_ext_ack *extack)
4358 {
4359         struct fib6_config cfg;
4360         int err;
4361
4362         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4363         if (err < 0)
4364                 return err;
4365
4366         if (cfg.fc_mp)
4367                 return ip6_route_multipath_del(&cfg, extack);
4368         else {
4369                 cfg.fc_delete_all_nh = 1;
4370                 return ip6_route_del(&cfg, extack);
4371         }
4372 }
4373
4374 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4375                               struct netlink_ext_ack *extack)
4376 {
4377         struct fib6_config cfg;
4378         int err;
4379
4380         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4381         if (err < 0)
4382                 return err;
4383
4384         if (cfg.fc_mp)
4385                 return ip6_route_multipath_add(&cfg, extack);
4386         else
4387                 return ip6_route_add(&cfg, extack);
4388 }
4389
4390 static size_t rt6_nlmsg_size(struct rt6_info *rt)
4391 {
4392         int nexthop_len = 0;
4393
4394         if (rt->rt6i_nsiblings) {
4395                 nexthop_len = nla_total_size(0)  /* RTA_MULTIPATH */
4396                             + NLA_ALIGN(sizeof(struct rtnexthop))
4397                             + nla_total_size(16) /* RTA_GATEWAY */
4398                             + lwtunnel_get_encap_size(rt->dst.lwtstate);
4399
4400                 nexthop_len *= rt->rt6i_nsiblings;
4401         }
4402
4403         return NLMSG_ALIGN(sizeof(struct rtmsg))
4404                + nla_total_size(16) /* RTA_SRC */
4405                + nla_total_size(16) /* RTA_DST */
4406                + nla_total_size(16) /* RTA_GATEWAY */
4407                + nla_total_size(16) /* RTA_PREFSRC */
4408                + nla_total_size(4) /* RTA_TABLE */
4409                + nla_total_size(4) /* RTA_IIF */
4410                + nla_total_size(4) /* RTA_OIF */
4411                + nla_total_size(4) /* RTA_PRIORITY */
4412                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4413                + nla_total_size(sizeof(struct rta_cacheinfo))
4414                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4415                + nla_total_size(1) /* RTA_PREF */
4416                + lwtunnel_get_encap_size(rt->dst.lwtstate)
4417                + nexthop_len;
4418 }
4419
4420 static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
4421                             unsigned int *flags, bool skip_oif)
4422 {
4423         if (rt->rt6i_nh_flags & RTNH_F_DEAD)
4424                 *flags |= RTNH_F_DEAD;
4425
4426         if (rt->rt6i_nh_flags & RTNH_F_LINKDOWN) {
4427                 *flags |= RTNH_F_LINKDOWN;
4428                 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
4429                         *flags |= RTNH_F_DEAD;
4430         }
4431
4432         if (rt->rt6i_flags & RTF_GATEWAY) {
4433                 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
4434                         goto nla_put_failure;
4435         }
4436
4437         *flags |= (rt->rt6i_nh_flags & RTNH_F_ONLINK);
4438         if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD)
4439                 *flags |= RTNH_F_OFFLOAD;
4440
4441         /* not needed for multipath encoding b/c it has a rtnexthop struct */
4442         if (!skip_oif && rt->dst.dev &&
4443             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
4444                 goto nla_put_failure;
4445
4446         if (rt->dst.lwtstate &&
4447             lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
4448                 goto nla_put_failure;
4449
4450         return 0;
4451
4452 nla_put_failure:
4453         return -EMSGSIZE;
4454 }
4455
4456 /* add multipath next hop */
4457 static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
4458 {
4459         struct rtnexthop *rtnh;
4460         unsigned int flags = 0;
4461
4462         rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4463         if (!rtnh)
4464                 goto nla_put_failure;
4465
4466         rtnh->rtnh_hops = rt->rt6i_nh_weight - 1;
4467         rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;
4468
4469         if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4470                 goto nla_put_failure;
4471
4472         rtnh->rtnh_flags = flags;
4473
4474         /* length of rtnetlink header + attributes */
4475         rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
4476
4477         return 0;
4478
4479 nla_put_failure:
4480         return -EMSGSIZE;
4481 }
4482
4483 static int rt6_fill_node(struct net *net,
4484                          struct sk_buff *skb, struct rt6_info *rt,
4485                          struct in6_addr *dst, struct in6_addr *src,
4486                          int iif, int type, u32 portid, u32 seq,
4487                          unsigned int flags)
4488 {
4489         u32 metrics[RTAX_MAX];
4490         struct rtmsg *rtm;
4491         struct nlmsghdr *nlh;
4492         long expires;
4493         u32 table;
4494
4495         nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4496         if (!nlh)
4497                 return -EMSGSIZE;
4498
4499         rtm = nlmsg_data(nlh);
4500         rtm->rtm_family = AF_INET6;
4501         rtm->rtm_dst_len = rt->rt6i_dst.plen;
4502         rtm->rtm_src_len = rt->rt6i_src.plen;
4503         rtm->rtm_tos = 0;
4504         if (rt->rt6i_table)
4505                 table = rt->rt6i_table->tb6_id;
4506         else
4507                 table = RT6_TABLE_UNSPEC;
4508         rtm->rtm_table = table;
4509         if (nla_put_u32(skb, RTA_TABLE, table))
4510                 goto nla_put_failure;
4511         if (rt->rt6i_flags & RTF_REJECT) {
4512                 switch (rt->dst.error) {
4513                 case -EINVAL:
4514                         rtm->rtm_type = RTN_BLACKHOLE;
4515                         break;
4516                 case -EACCES:
4517                         rtm->rtm_type = RTN_PROHIBIT;
4518                         break;
4519                 case -EAGAIN:
4520                         rtm->rtm_type = RTN_THROW;
4521                         break;
4522                 default:
4523                         rtm->rtm_type = RTN_UNREACHABLE;
4524                         break;
4525                 }
4526         }
4527         else if (rt->rt6i_flags & RTF_LOCAL)
4528                 rtm->rtm_type = RTN_LOCAL;
4529         else if (rt->rt6i_flags & RTF_ANYCAST)
4530                 rtm->rtm_type = RTN_ANYCAST;
4531         else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
4532                 rtm->rtm_type = RTN_LOCAL;
4533         else
4534                 rtm->rtm_type = RTN_UNICAST;
4535         rtm->rtm_flags = 0;
4536         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4537         rtm->rtm_protocol = rt->rt6i_protocol;
4538
4539         if (rt->rt6i_flags & RTF_CACHE)
4540                 rtm->rtm_flags |= RTM_F_CLONED;
4541
4542         if (dst) {
4543                 if (nla_put_in6_addr(skb, RTA_DST, dst))
4544                         goto nla_put_failure;
4545                 rtm->rtm_dst_len = 128;
4546         } else if (rtm->rtm_dst_len)
4547                 if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
4548                         goto nla_put_failure;
4549 #ifdef CONFIG_IPV6_SUBTREES
4550         if (src) {
4551                 if (nla_put_in6_addr(skb, RTA_SRC, src))
4552                         goto nla_put_failure;
4553                 rtm->rtm_src_len = 128;
4554         } else if (rtm->rtm_src_len &&
4555                    nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
4556                 goto nla_put_failure;
4557 #endif
4558         if (iif) {
4559 #ifdef CONFIG_IPV6_MROUTE
4560                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
4561                         int err = ip6mr_get_route(net, skb, rtm, portid);
4562
4563                         if (err == 0)
4564                                 return 0;
4565                         if (err < 0)
4566                                 goto nla_put_failure;
4567                 } else
4568 #endif
4569                         if (nla_put_u32(skb, RTA_IIF, iif))
4570                                 goto nla_put_failure;
4571         } else if (dst) {
4572                 struct in6_addr saddr_buf;
4573                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
4574                     nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4575                         goto nla_put_failure;
4576         }
4577
4578         if (rt->rt6i_prefsrc.plen) {
4579                 struct in6_addr saddr_buf;
4580                 saddr_buf = rt->rt6i_prefsrc.addr;
4581                 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4582                         goto nla_put_failure;
4583         }
4584
4585         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
4586         if (rt->rt6i_pmtu)
4587                 metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
4588         if (rtnetlink_put_metrics(skb, metrics) < 0)
4589                 goto nla_put_failure;
4590
4591         if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
4592                 goto nla_put_failure;
4593
4594         /* For multipath routes, walk the siblings list and add
4595          * each as a nexthop within RTA_MULTIPATH.
4596          */
4597         if (rt->rt6i_nsiblings) {
4598                 struct rt6_info *sibling, *next_sibling;
4599                 struct nlattr *mp;
4600
4601                 mp = nla_nest_start(skb, RTA_MULTIPATH);
4602                 if (!mp)
4603                         goto nla_put_failure;
4604
4605                 if (rt6_add_nexthop(skb, rt) < 0)
4606                         goto nla_put_failure;
4607
4608                 list_for_each_entry_safe(sibling, next_sibling,
4609                                          &rt->rt6i_siblings, rt6i_siblings) {
4610                         if (rt6_add_nexthop(skb, sibling) < 0)
4611                                 goto nla_put_failure;
4612                 }
4613
4614                 nla_nest_end(skb, mp);
4615         } else {
4616                 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
4617                         goto nla_put_failure;
4618         }
4619
4620         expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
4621
4622         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
4623                 goto nla_put_failure;
4624
4625         if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
4626                 goto nla_put_failure;
4627
4628
4629         nlmsg_end(skb, nlh);
4630         return 0;
4631
4632 nla_put_failure:
4633         nlmsg_cancel(skb, nlh);
4634         return -EMSGSIZE;
4635 }
4636
4637 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
4638 {
4639         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4640         struct net *net = arg->net;
4641
4642         if (rt == net->ipv6.ip6_null_entry)
4643                 return 0;
4644
4645         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4646                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4647
4648                 /* user wants prefix routes only */
4649                 if (rtm->rtm_flags & RTM_F_PREFIX &&
4650                     !(rt->rt6i_flags & RTF_PREFIX_RT)) {
4651                         /* success since this is not a prefix route */
4652                         return 1;
4653                 }
4654         }
4655
4656         return rt6_fill_node(net,
4657                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
4658                      NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
4659                      NLM_F_MULTI);
4660 }
4661
4662 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4663                               struct netlink_ext_ack *extack)
4664 {
4665         struct net *net = sock_net(in_skb->sk);
4666         struct nlattr *tb[RTA_MAX+1];
4667         int err, iif = 0, oif = 0;
4668         struct dst_entry *dst;
4669         struct rt6_info *rt;
4670         struct sk_buff *skb;
4671         struct rtmsg *rtm;
4672         struct flowi6 fl6;
4673         bool fibmatch;
4674
4675         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4676                           extack);
4677         if (err < 0)
4678                 goto errout;
4679
4680         err = -EINVAL;
4681         memset(&fl6, 0, sizeof(fl6));
4682         rtm = nlmsg_data(nlh);
4683         fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4684         fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4685
4686         if (tb[RTA_SRC]) {
4687                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4688                         goto errout;
4689
4690                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4691         }
4692
4693         if (tb[RTA_DST]) {
4694                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4695                         goto errout;
4696
4697                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4698         }
4699
4700         if (tb[RTA_IIF])
4701                 iif = nla_get_u32(tb[RTA_IIF]);
4702
4703         if (tb[RTA_OIF])
4704                 oif = nla_get_u32(tb[RTA_OIF]);
4705
4706         if (tb[RTA_MARK])
4707                 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4708
4709         if (tb[RTA_UID])
4710                 fl6.flowi6_uid = make_kuid(current_user_ns(),
4711                                            nla_get_u32(tb[RTA_UID]));
4712         else
4713                 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4714
4715         if (iif) {
4716                 struct net_device *dev;
4717                 int flags = 0;
4718
4719                 rcu_read_lock();
4720
4721                 dev = dev_get_by_index_rcu(net, iif);
4722                 if (!dev) {
4723                         rcu_read_unlock();
4724                         err = -ENODEV;
4725                         goto errout;
4726                 }
4727
4728                 fl6.flowi6_iif = iif;
4729
4730                 if (!ipv6_addr_any(&fl6.saddr))
4731                         flags |= RT6_LOOKUP_F_HAS_SADDR;
4732
4733                 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
4734
4735                 rcu_read_unlock();
4736         } else {
4737                 fl6.flowi6_oif = oif;
4738
4739                 dst = ip6_route_output(net, NULL, &fl6);
4740         }
4741
4742
4743         rt = container_of(dst, struct rt6_info, dst);
4744         if (rt->dst.error) {
4745                 err = rt->dst.error;
4746                 ip6_rt_put(rt);
4747                 goto errout;
4748         }
4749
4750         if (rt == net->ipv6.ip6_null_entry) {
4751                 err = rt->dst.error;
4752                 ip6_rt_put(rt);
4753                 goto errout;
4754         }
4755
4756         if (fibmatch && rt->from) {
4757                 struct rt6_info *ort = rt->from;
4758
4759                 dst_hold(&ort->dst);
4760                 ip6_rt_put(rt);
4761                 rt = ort;
4762         }
4763
4764         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4765         if (!skb) {
4766                 ip6_rt_put(rt);
4767                 err = -ENOBUFS;
4768                 goto errout;
4769         }
4770
4771         skb_dst_set(skb, &rt->dst);
4772         if (fibmatch)
4773                 err = rt6_fill_node(net, skb, rt, NULL, NULL, iif,
4774                                     RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4775                                     nlh->nlmsg_seq, 0);
4776         else
4777                 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
4778                                     RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4779                                     nlh->nlmsg_seq, 0);
4780         if (err < 0) {
4781                 kfree_skb(skb);
4782                 goto errout;
4783         }
4784
4785         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
4786 errout:
4787         return err;
4788 }
4789
4790 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
4791                      unsigned int nlm_flags)
4792 {
4793         struct sk_buff *skb;
4794         struct net *net = info->nl_net;
4795         u32 seq;
4796         int err;
4797
4798         err = -ENOBUFS;
4799         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4800
4801         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4802         if (!skb)
4803                 goto errout;
4804
4805         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
4806                                 event, info->portid, seq, nlm_flags);
4807         if (err < 0) {
4808                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4809                 WARN_ON(err == -EMSGSIZE);
4810                 kfree_skb(skb);
4811                 goto errout;
4812         }
4813         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4814                     info->nlh, gfp_any());
4815         return;
4816 errout:
4817         if (err < 0)
4818                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
4819 }
4820
4821 static int ip6_route_dev_notify(struct notifier_block *this,
4822                                 unsigned long event, void *ptr)
4823 {
4824         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4825         struct net *net = dev_net(dev);
4826
4827         if (!(dev->flags & IFF_LOOPBACK))
4828                 return NOTIFY_OK;
4829
4830         if (event == NETDEV_REGISTER) {
4831                 net->ipv6.ip6_null_entry->dst.dev = dev;
4832                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
4833 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4834                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
4835                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
4836                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
4837                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
4838 #endif
4839          } else if (event == NETDEV_UNREGISTER &&
4840                     dev->reg_state != NETREG_UNREGISTERED) {
4841                 /* NETDEV_UNREGISTER could be fired for multiple times by
4842                  * netdev_wait_allrefs(). Make sure we only call this once.
4843                  */
4844                 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
4845 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4846                 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
4847                 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
4848 #endif
4849         }
4850
4851         return NOTIFY_OK;
4852 }
4853
4854 /*
4855  *      /proc
4856  */
4857
4858 #ifdef CONFIG_PROC_FS
4859
4860 static const struct file_operations ipv6_route_proc_fops = {
4861         .open           = ipv6_route_open,
4862         .read           = seq_read,
4863         .llseek         = seq_lseek,
4864         .release        = seq_release_net,
4865 };
4866
4867 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
4868 {
4869         struct net *net = (struct net *)seq->private;
4870         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
4871                    net->ipv6.rt6_stats->fib_nodes,
4872                    net->ipv6.rt6_stats->fib_route_nodes,
4873                    atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
4874                    net->ipv6.rt6_stats->fib_rt_entries,
4875                    net->ipv6.rt6_stats->fib_rt_cache,
4876                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
4877                    net->ipv6.rt6_stats->fib_discarded_routes);
4878
4879         return 0;
4880 }
4881
4882 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
4883 {
4884         return single_open_net(inode, file, rt6_stats_seq_show);
4885 }
4886
4887 static const struct file_operations rt6_stats_seq_fops = {
4888         .open    = rt6_stats_seq_open,
4889         .read    = seq_read,
4890         .llseek  = seq_lseek,
4891         .release = single_release_net,
4892 };
4893 #endif  /* CONFIG_PROC_FS */
4894
4895 #ifdef CONFIG_SYSCTL
4896
4897 static
4898 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
4899                               void __user *buffer, size_t *lenp, loff_t *ppos)
4900 {
4901         struct net *net;
4902         int delay;
4903         if (!write)
4904                 return -EINVAL;
4905
4906         net = (struct net *)ctl->extra1;
4907         delay = net->ipv6.sysctl.flush_delay;
4908         proc_dointvec(ctl, write, buffer, lenp, ppos);
4909         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
4910         return 0;
4911 }
4912
4913 struct ctl_table ipv6_route_table_template[] = {
4914         {
4915                 .procname       =       "flush",
4916                 .data           =       &init_net.ipv6.sysctl.flush_delay,
4917                 .maxlen         =       sizeof(int),
4918                 .mode           =       0200,
4919                 .proc_handler   =       ipv6_sysctl_rtcache_flush
4920         },
4921         {
4922                 .procname       =       "gc_thresh",
4923                 .data           =       &ip6_dst_ops_template.gc_thresh,
4924                 .maxlen         =       sizeof(int),
4925                 .mode           =       0644,
4926                 .proc_handler   =       proc_dointvec,
4927         },
4928         {
4929                 .procname       =       "max_size",
4930                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
4931                 .maxlen         =       sizeof(int),
4932                 .mode           =       0644,
4933                 .proc_handler   =       proc_dointvec,
4934         },
4935         {
4936                 .procname       =       "gc_min_interval",
4937                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4938                 .maxlen         =       sizeof(int),
4939                 .mode           =       0644,
4940                 .proc_handler   =       proc_dointvec_jiffies,
4941         },
4942         {
4943                 .procname       =       "gc_timeout",
4944                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
4945                 .maxlen         =       sizeof(int),
4946                 .mode           =       0644,
4947                 .proc_handler   =       proc_dointvec_jiffies,
4948         },
4949         {
4950                 .procname       =       "gc_interval",
4951                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
4952                 .maxlen         =       sizeof(int),
4953                 .mode           =       0644,
4954                 .proc_handler   =       proc_dointvec_jiffies,
4955         },
4956         {
4957                 .procname       =       "gc_elasticity",
4958                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
4959                 .maxlen         =       sizeof(int),
4960                 .mode           =       0644,
4961                 .proc_handler   =       proc_dointvec,
4962         },
4963         {
4964                 .procname       =       "mtu_expires",
4965                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
4966                 .maxlen         =       sizeof(int),
4967                 .mode           =       0644,
4968                 .proc_handler   =       proc_dointvec_jiffies,
4969         },
4970         {
4971                 .procname       =       "min_adv_mss",
4972                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
4973                 .maxlen         =       sizeof(int),
4974                 .mode           =       0644,
4975                 .proc_handler   =       proc_dointvec,
4976         },
4977         {
4978                 .procname       =       "gc_min_interval_ms",
4979                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4980                 .maxlen         =       sizeof(int),
4981                 .mode           =       0644,
4982                 .proc_handler   =       proc_dointvec_ms_jiffies,
4983         },
4984         { }
4985 };
4986
4987 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
4988 {
4989         struct ctl_table *table;
4990
4991         table = kmemdup(ipv6_route_table_template,
4992                         sizeof(ipv6_route_table_template),
4993                         GFP_KERNEL);
4994
4995         if (table) {
4996                 table[0].data = &net->ipv6.sysctl.flush_delay;
4997                 table[0].extra1 = net;
4998                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
4999                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
5000                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5001                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
5002                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
5003                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
5004                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
5005                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
5006                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5007
5008                 /* Don't export sysctls to unprivileged users */
5009                 if (net->user_ns != &init_user_ns)
5010                         table[0].procname = NULL;
5011         }
5012
5013         return table;
5014 }
5015 #endif
5016
5017 static int __net_init ip6_route_net_init(struct net *net)
5018 {
5019         int ret = -ENOMEM;
5020
5021         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
5022                sizeof(net->ipv6.ip6_dst_ops));
5023
5024         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
5025                 goto out_ip6_dst_ops;
5026
5027         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
5028                                            sizeof(*net->ipv6.ip6_null_entry),
5029                                            GFP_KERNEL);
5030         if (!net->ipv6.ip6_null_entry)
5031                 goto out_ip6_dst_entries;
5032         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5033         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
5034                          ip6_template_metrics, true);
5035
5036 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5037         net->ipv6.fib6_has_custom_rules = false;
5038         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
5039                                                sizeof(*net->ipv6.ip6_prohibit_entry),
5040                                                GFP_KERNEL);
5041         if (!net->ipv6.ip6_prohibit_entry)
5042                 goto out_ip6_null_entry;
5043         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5044         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
5045                          ip6_template_metrics, true);
5046
5047         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
5048                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
5049                                                GFP_KERNEL);
5050         if (!net->ipv6.ip6_blk_hole_entry)
5051                 goto out_ip6_prohibit_entry;
5052         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5053         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
5054                          ip6_template_metrics, true);
5055 #endif
5056
5057         net->ipv6.sysctl.flush_delay = 0;
5058         net->ipv6.sysctl.ip6_rt_max_size = 4096;
5059         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
5060         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
5061         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
5062         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
5063         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
5064         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
5065
5066         net->ipv6.ip6_rt_gc_expire = 30*HZ;
5067
5068         ret = 0;
5069 out:
5070         return ret;
5071
5072 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5073 out_ip6_prohibit_entry:
5074         kfree(net->ipv6.ip6_prohibit_entry);
5075 out_ip6_null_entry:
5076         kfree(net->ipv6.ip6_null_entry);
5077 #endif
5078 out_ip6_dst_entries:
5079         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5080 out_ip6_dst_ops:
5081         goto out;
5082 }
5083
5084 static void __net_exit ip6_route_net_exit(struct net *net)
5085 {
5086         kfree(net->ipv6.ip6_null_entry);
5087 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5088         kfree(net->ipv6.ip6_prohibit_entry);
5089         kfree(net->ipv6.ip6_blk_hole_entry);
5090 #endif
5091         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5092 }
5093
5094 static int __net_init ip6_route_net_init_late(struct net *net)
5095 {
5096 #ifdef CONFIG_PROC_FS
5097         proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
5098         proc_create("rt6_stats", 0444, net->proc_net, &rt6_stats_seq_fops);
5099 #endif
5100         return 0;
5101 }
5102
5103 static void __net_exit ip6_route_net_exit_late(struct net *net)
5104 {
5105 #ifdef CONFIG_PROC_FS
5106         remove_proc_entry("ipv6_route", net->proc_net);
5107         remove_proc_entry("rt6_stats", net->proc_net);
5108 #endif
5109 }
5110
5111 static struct pernet_operations ip6_route_net_ops = {
5112         .init = ip6_route_net_init,
5113         .exit = ip6_route_net_exit,
5114 };
5115
5116 static int __net_init ipv6_inetpeer_init(struct net *net)
5117 {
5118         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5119
5120         if (!bp)
5121                 return -ENOMEM;
5122         inet_peer_base_init(bp);
5123         net->ipv6.peers = bp;
5124         return 0;
5125 }
5126
5127 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5128 {
5129         struct inet_peer_base *bp = net->ipv6.peers;
5130
5131         net->ipv6.peers = NULL;
5132         inetpeer_invalidate_tree(bp);
5133         kfree(bp);
5134 }
5135
5136 static struct pernet_operations ipv6_inetpeer_ops = {
5137         .init   =       ipv6_inetpeer_init,
5138         .exit   =       ipv6_inetpeer_exit,
5139 };
5140
5141 static struct pernet_operations ip6_route_net_late_ops = {
5142         .init = ip6_route_net_init_late,
5143         .exit = ip6_route_net_exit_late,
5144 };
5145
5146 static struct notifier_block ip6_route_dev_notifier = {
5147         .notifier_call = ip6_route_dev_notify,
5148         .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
5149 };
5150
5151 void __init ip6_route_init_special_entries(void)
5152 {
5153         /* Registering of the loopback is done before this portion of code,
5154          * the loopback reference in rt6_info will not be taken, do it
5155          * manually for init_net */
5156         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5157         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5158   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5159         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5160         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5161         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5162         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5163   #endif
5164 }
5165
5166 int __init ip6_route_init(void)
5167 {
5168         int ret;
5169         int cpu;
5170
5171         ret = -ENOMEM;
5172         ip6_dst_ops_template.kmem_cachep =
5173                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
5174                                   SLAB_HWCACHE_ALIGN, NULL);
5175         if (!ip6_dst_ops_template.kmem_cachep)
5176                 goto out;
5177
5178         ret = dst_entries_init(&ip6_dst_blackhole_ops);
5179         if (ret)
5180                 goto out_kmem_cache;
5181
5182         ret = register_pernet_subsys(&ipv6_inetpeer_ops);
5183         if (ret)
5184                 goto out_dst_entries;
5185
5186         ret = register_pernet_subsys(&ip6_route_net_ops);
5187         if (ret)
5188                 goto out_register_inetpeer;
5189
5190         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
5191
5192         ret = fib6_init();
5193         if (ret)
5194                 goto out_register_subsys;
5195
5196         ret = xfrm6_init();
5197         if (ret)
5198                 goto out_fib6_init;
5199
5200         ret = fib6_rules_init();
5201         if (ret)
5202                 goto xfrm6_init;
5203
5204         ret = register_pernet_subsys(&ip6_route_net_late_ops);
5205         if (ret)
5206                 goto fib6_rules_init;
5207
5208         ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
5209                                    inet6_rtm_newroute, NULL, 0);
5210         if (ret < 0)
5211                 goto out_register_late_subsys;
5212
5213         ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
5214                                    inet6_rtm_delroute, NULL, 0);
5215         if (ret < 0)
5216                 goto out_register_late_subsys;
5217
5218         ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
5219                                    inet6_rtm_getroute, NULL,
5220                                    RTNL_FLAG_DOIT_UNLOCKED);
5221         if (ret < 0)
5222                 goto out_register_late_subsys;
5223
5224         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
5225         if (ret)
5226                 goto out_register_late_subsys;
5227
5228         for_each_possible_cpu(cpu) {
5229                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
5230
5231                 INIT_LIST_HEAD(&ul->head);
5232                 spin_lock_init(&ul->lock);
5233         }
5234
5235 out:
5236         return ret;
5237
5238 out_register_late_subsys:
5239         rtnl_unregister_all(PF_INET6);
5240         unregister_pernet_subsys(&ip6_route_net_late_ops);
5241 fib6_rules_init:
5242         fib6_rules_cleanup();
5243 xfrm6_init:
5244         xfrm6_fini();
5245 out_fib6_init:
5246         fib6_gc_cleanup();
5247 out_register_subsys:
5248         unregister_pernet_subsys(&ip6_route_net_ops);
5249 out_register_inetpeer:
5250         unregister_pernet_subsys(&ipv6_inetpeer_ops);
5251 out_dst_entries:
5252         dst_entries_destroy(&ip6_dst_blackhole_ops);
5253 out_kmem_cache:
5254         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5255         goto out;
5256 }
5257
5258 void ip6_route_cleanup(void)
5259 {
5260         unregister_netdevice_notifier(&ip6_route_dev_notifier);
5261         unregister_pernet_subsys(&ip6_route_net_late_ops);
5262         fib6_rules_cleanup();
5263         xfrm6_fini();
5264         fib6_gc_cleanup();
5265         unregister_pernet_subsys(&ipv6_inetpeer_ops);
5266         unregister_pernet_subsys(&ip6_route_net_ops);
5267         dst_entries_destroy(&ip6_dst_blackhole_ops);
5268         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5269 }