/*
 *      IPv6 output functions
 *      Linux INET6 implementation
 *
 *      Authors:
 *      Pedro Roque             <roque@di.fc.ul.pt>
 *
 *      Based on linux/net/ipv4/ip_output.c
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 *      Changes:
 *      A.N.Kuznetsov   :       arithmetic in fragmentation.
 *                              extension headers are implemented.
 *                              route changes now work.
 *                              ip6_forward does not confuse sniffers.
 *                              etc.
 *
 *      H. von Brand    :       Added missing #include <linux/string.h>
 *      Imran Patel     :       frag id should be in NBO
 *      Kazunori MIYAZAWA @USAGI
 *                      :       add ip6_append_data and related functions
 *                              for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>

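/* Descriptive note: ip6_finish_output2() is the last step before the packet
 * reaches the device. It loops multicast packets back to local listeners
 * when the socket and routing state require it, hands the skb to a
 * lightweight tunnel when the dst demands it, and otherwise resolves (or
 * creates) the neighbour entry for the next hop and transmits through
 * neigh_output().
 */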
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct net_device *dev = dst->dev;
        struct neighbour *neigh;
        struct in6_addr *nexthop;
        int ret;

        if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
                struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

                if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
                    ((mroute6_is_socket(net, skb) &&
                     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
                     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
                                         &ipv6_hdr(skb)->saddr))) {
                        struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

                        /* Do not check for IFF_ALLMULTI; multicast routing
                           is not supported in any case.
                         */
                        if (newskb)
                                NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
                                        net, sk, newskb, NULL, newskb->dev,
                                        dev_loopback_xmit);

                        if (ipv6_hdr(skb)->hop_limit == 0) {
                                IP6_INC_STATS(net, idev,
                                              IPSTATS_MIB_OUTDISCARDS);
                                kfree_skb(skb);
                                return 0;
                        }
                }

                IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);

                if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
                    IPV6_ADDR_SCOPE_NODELOCAL &&
                    !(dev->flags & IFF_LOOPBACK)) {
                        kfree_skb(skb);
                        return 0;
                }
        }

        if (lwtunnel_xmit_redirect(dst->lwtstate)) {
                int res = lwtunnel_xmit(skb);

                if (res < 0 || res == LWTUNNEL_XMIT_DONE)
                        return res;
        }

        rcu_read_lock_bh();
        nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
        neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
        if (unlikely(!neigh))
                neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
        if (!IS_ERR(neigh)) {
                sock_confirm_neigh(skb, neigh);
                ret = neigh_output(neigh, skb);
                rcu_read_unlock_bh();
                return ret;
        }
        rcu_read_unlock_bh();

        IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
        kfree_skb(skb);
        return -EINVAL;
}

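/* Descriptive note: ip6_finish_output() runs the egress cgroup BPF hook,
 * re-routes packets that picked up an XFRM policy after SNAT, and
 * fragments anything that no longer fits the path MTU before handing the
 * skb to ip6_finish_output2().
 */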
static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        int ret;

        ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
        if (ret) {
                kfree_skb(skb);
                return ret;
        }

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
        /* Policy lookup after SNAT yielded a new policy */
        if (skb_dst(skb)->xfrm) {
                IPCB(skb)->flags |= IPSKB_REROUTED;
                return dst_output(net, sk, skb);
        }
#endif

        if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
            dst_allfrag(skb_dst(skb)) ||
            (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
                return ip6_fragment(net, sk, skb, ip6_finish_output2);
        else
                return ip6_finish_output2(net, sk, skb);
}

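/* Descriptive note: ip6_output() is the dst_output() entry point for
 * locally generated IPv6 packets. It tags the skb as IPv6, honours the
 * per-device disable_ipv6 setting, and traverses the netfilter
 * POST_ROUTING hook (skipped for packets netfilter already re-routed).
 */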
int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        struct net_device *dev = skb_dst(skb)->dev;
        struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

        skb->protocol = htons(ETH_P_IPV6);
        skb->dev = dev;

        if (unlikely(idev->cnf.disable_ipv6)) {
                IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
                kfree_skb(skb);
                return 0;
        }

        return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
                            net, sk, skb, NULL, dev,
                            ip6_finish_output,
                            !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
{
        if (!np->autoflowlabel_set)
                return ip6_default_np_autolabel(net);
        else
                return np->autoflowlabel;
}

/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note: the socket lock is not held for SYNACK packets, but the skb might
 * be modified by calls to skb_set_owner_w() and ipv6_local_error(),
 * which use proper atomic operations or spinlocks.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
             __u32 mark, struct ipv6_txoptions *opt, int tclass)
{
        struct net *net = sock_net(sk);
        const struct ipv6_pinfo *np = inet6_sk(sk);
        struct in6_addr *first_hop = &fl6->daddr;
        struct dst_entry *dst = skb_dst(skb);
        struct ipv6hdr *hdr;
        u8  proto = fl6->flowi6_proto;
        int seg_len = skb->len;
        int hlimit = -1;
        u32 mtu;

        if (opt) {
                unsigned int head_room;

                /* First: exthdrs may take lots of space (~8K for now);
                   MAX_HEADER is not enough.
                 */
                head_room = opt->opt_nflen + opt->opt_flen;
                seg_len += head_room;
                head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

                if (skb_headroom(skb) < head_room) {
                        struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
                        if (!skb2) {
                                IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                                              IPSTATS_MIB_OUTDISCARDS);
                                kfree_skb(skb);
                                return -ENOBUFS;
                        }
                        consume_skb(skb);
                        skb = skb2;
                        /* skb_set_owner_w() changes sk->sk_wmem_alloc atomically,
                         * so it is safe to call in our context (socket lock not held)
                         */
                        skb_set_owner_w(skb, (struct sock *)sk);
                }
                if (opt->opt_flen)
                        ipv6_push_frag_opts(skb, opt, &proto);
                if (opt->opt_nflen)
                        ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
                                             &fl6->saddr);
        }

        skb_push(skb, sizeof(struct ipv6hdr));
        skb_reset_network_header(skb);
        hdr = ipv6_hdr(skb);

        /*
         *      Fill in the IPv6 header
         */
        if (np)
                hlimit = np->hop_limit;
        if (hlimit < 0)
                hlimit = ip6_dst_hoplimit(dst);

        ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
                                ip6_autoflowlabel(net, np), fl6));

        hdr->payload_len = htons(seg_len);
        hdr->nexthdr = proto;
        hdr->hop_limit = hlimit;

        hdr->saddr = fl6->saddr;
        hdr->daddr = *first_hop;

        skb->protocol = htons(ETH_P_IPV6);
        skb->priority = sk->sk_priority;
        skb->mark = mark;

        mtu = dst_mtu(dst);
        if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
                IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
                              IPSTATS_MIB_OUT, skb->len);

                /* if the egress device is enslaved to an L3 master device,
                 * pass the skb to its handler for processing
                 */
                skb = l3mdev_ip6_out((struct sock *)sk, skb);
                if (unlikely(!skb))
                        return 0;

                /* hooks should never assume the socket lock is held,
                 * so we promote our socket to non-const
                 */
                return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
                               net, (struct sock *)sk, skb, NULL, dst->dev,
                               dst_output);
        }

        skb->dev = dst->dev;
        /* ipv6_local_error() does not require the socket lock,
         * so we promote our socket to non-const
         */
        ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);

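/* Descriptive note: deliver a Router Alert packet to every raw socket
 * registered for this alert value (via the IPV6_ROUTER_ALERT socket
 * option). All but the last matching socket get a clone; returns 1 if a
 * socket consumed the skb, 0 if the caller still owns it.
 */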
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
        struct ip6_ra_chain *ra;
        struct sock *last = NULL;

        read_lock(&ip6_ra_lock);
        for (ra = ip6_ra_chain; ra; ra = ra->next) {
                struct sock *sk = ra->sk;
                if (sk && ra->sel == sel &&
                    (!sk->sk_bound_dev_if ||
                     sk->sk_bound_dev_if == skb->dev->ifindex)) {
                        if (last) {
                                struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                                if (skb2)
                                        rawv6_rcv(last, skb2);
                        }
                        last = sk;
                }
        }

        if (last) {
                rawv6_rcv(last, skb);
                read_unlock(&ip6_ra_lock);
                return 1;
        }
        read_unlock(&ip6_ra_lock);
        return 0;
}

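/* Descriptive note: decide what to do with a packet addressed to a proxied
 * (NDP proxy) destination: 1 means hand it to local input (unicast
 * neighbour discovery), -1 means drop it (link-local destination),
 * 0 means keep forwarding it.
 */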
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
        struct ipv6hdr *hdr = ipv6_hdr(skb);
        u8 nexthdr = hdr->nexthdr;
        __be16 frag_off;
        int offset;

        if (ipv6_ext_hdr(nexthdr)) {
                offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
                if (offset < 0)
                        return 0;
        } else
                offset = sizeof(struct ipv6hdr);

        if (nexthdr == IPPROTO_ICMPV6) {
                struct icmp6hdr *icmp6;

                if (!pskb_may_pull(skb, (skb_network_header(skb) +
                                         offset + 1 - skb->data)))
                        return 0;

                icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

                switch (icmp6->icmp6_type) {
                case NDISC_ROUTER_SOLICITATION:
                case NDISC_ROUTER_ADVERTISEMENT:
                case NDISC_NEIGHBOUR_SOLICITATION:
                case NDISC_NEIGHBOUR_ADVERTISEMENT:
                case NDISC_REDIRECT:
                        /* A unicast neighbor discovery message destined to
                         * the proxied address must be passed to the input
                         * function so we can react to it.
                         */
                        return 1;
                default:
                        break;
                }
        }

        /*
         * The proxying router can't forward traffic sent to a link-local
         * address, so signal the sender and discard the packet. This
         * behavior is clarified by the MIPv6 specification.
         */
        if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
                dst_link_failure(skb);
                return -1;
        }

        return 0;
}

static inline int ip6_forward_finish(struct net *net, struct sock *sk,
                                     struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);

        __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
        __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);

        return dst_output(net, sk, skb);
}

static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
        if (skb->len <= mtu)
                return false;

        /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
        if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
                return true;

        if (skb->ignore_df)
                return false;

        if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
                return false;

        return true;
}

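/* Descriptive note: ip6_forward() handles packets not addressed to this
 * host: it validates the hop limit, honours Router Alert and NDP proxy
 * handling, emits redirects when the packet leaves through the interface
 * it arrived on, enforces the path MTU, then decrements hop_limit and
 * passes the skb through the NF_INET_FORWARD hook.
 */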
int ip6_forward(struct sk_buff *skb)
{
        struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
        struct dst_entry *dst = skb_dst(skb);
        struct ipv6hdr *hdr = ipv6_hdr(skb);
        struct inet6_skb_parm *opt = IP6CB(skb);
        struct net *net = dev_net(dst->dev);
        u32 mtu;

        if (net->ipv6.devconf_all->forwarding == 0)
                goto error;

        if (skb->pkt_type != PACKET_HOST)
                goto drop;

        if (unlikely(skb->sk))
                goto drop;

        if (skb_warn_if_lro(skb))
                goto drop;

        if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
                __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
                goto drop;
        }

        skb_forward_csum(skb);

        /*
         *      We do NOT process RA packets; we push them to user level
         *      AS IS, without any warranty that the application will be
         *      able to interpret them. The reason is that we
         *      cannot make anything clever here.
         *
         *      We are not an end node, so if the packet contains
         *      AH/ESP, we cannot do anything with it.
         *      Defragmentation would also be a mistake; RA packets
         *      cannot be fragmented, because there is no guarantee
         *      that different fragments will go along one path. --ANK
         */
        if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
                if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
                        return 0;
        }

        /*
         *      check and decrement hop limit
         */
        if (hdr->hop_limit <= 1) {
                /* Force OUTPUT device used as source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
                __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

                kfree_skb(skb);
                return -ETIMEDOUT;
        }

        /* XXX: idev->cnf.proxy_ndp? */
        if (net->ipv6.devconf_all->proxy_ndp &&
            pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
                int proxied = ip6_forward_proxy_check(skb);
                if (proxied > 0)
                        return ip6_input(skb);
                else if (proxied < 0) {
                        __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
                        goto drop;
                }
        }

        if (!xfrm6_route_forward(skb)) {
                __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
                goto drop;
        }
        dst = skb_dst(skb);

        /* The IPv6 specs say nothing about it, but it is clear that we cannot
           send redirects to source routed frames.
           We don't send redirects to frames decapsulated from IPsec.
         */
        if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
                struct in6_addr *target = NULL;
                struct inet_peer *peer;
                struct rt6_info *rt;

                /*
                 *      incoming and outgoing devices are the same:
                 *      send a redirect.
                 */

                rt = (struct rt6_info *) dst;
                if (rt->rt6i_flags & RTF_GATEWAY)
                        target = &rt->rt6i_gateway;
                else
                        target = &hdr->daddr;

                peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

                /* Limit redirects both by destination (here)
                   and by source (inside ndisc_send_redirect)
                 */
                if (inet_peer_xrlim_allow(peer, 1*HZ))
                        ndisc_send_redirect(skb, target);
                if (peer)
                        inet_putpeer(peer);
        } else {
                int addrtype = ipv6_addr_type(&hdr->saddr);

                /* This check is security critical. */
                if (addrtype == IPV6_ADDR_ANY ||
                    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
                        goto error;
                if (addrtype & IPV6_ADDR_LINKLOCAL) {
                        icmpv6_send(skb, ICMPV6_DEST_UNREACH,
                                    ICMPV6_NOT_NEIGHBOUR, 0);
                        goto error;
                }
        }

        mtu = ip6_dst_mtu_forward(dst);
        if (mtu < IPV6_MIN_MTU)
                mtu = IPV6_MIN_MTU;

        if (ip6_pkt_too_big(skb, mtu)) {
                /* Again, force OUTPUT device used as source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_FRAGFAILS);
                kfree_skb(skb);
                return -EMSGSIZE;
        }

        if (skb_cow(skb, dst->dev->hard_header_len)) {
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_OUTDISCARDS);
                goto drop;
        }

        hdr = ipv6_hdr(skb);

        /* Mangling the hop count is delayed to the point after skb COW */

        hdr->hop_limit--;

        return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
                       net, NULL, skb, skb->dev, dst->dev,
                       ip6_forward_finish);

error:
        __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
drop:
        kfree_skb(skb);
        return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
        to->pkt_type = from->pkt_type;
        to->priority = from->priority;
        to->protocol = from->protocol;
        skb_dst_drop(to);
        skb_dst_set(to, dst_clone(skb_dst(from)));
        to->dev = from->dev;
        to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
        to->tc_index = from->tc_index;
#endif
        nf_copy(to, from);
        skb_copy_secmark(to, from);
}

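/* Descriptive note: ip6_fragment() splits a packet into fragments. The
 * fast path reuses an existing frag_list and only prepends the per-fragment
 * headers; when the geometry does not fit, the slow path allocates a fresh
 * skb per fragment and copies the payload. All fragments share one
 * identification value.
 */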
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
                 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
        struct sk_buff *frag;
        struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
        struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
                                inet6_sk(skb->sk) : NULL;
        struct ipv6hdr *tmp_hdr;
        struct frag_hdr *fh;
        unsigned int mtu, hlen, left, len;
        int hroom, troom;
        __be32 frag_id;
        int ptr, offset = 0, err = 0;
        u8 *prevhdr, nexthdr = 0;

        err = ip6_find_1stfragopt(skb, &prevhdr);
        if (err < 0)
                goto fail;
        hlen = err;
        nexthdr = *prevhdr;

        mtu = ip6_skb_dst_mtu(skb);

        /* We must not fragment if the socket is set to force MTU discovery
         * or if the skb is not generated by a local socket.
         */
        if (unlikely(!skb->ignore_df && skb->len > mtu))
                goto fail_toobig;

        if (IP6CB(skb)->frag_max_size) {
                if (IP6CB(skb)->frag_max_size > mtu)
                        goto fail_toobig;

                /* don't send fragments larger than what we received */
                mtu = IP6CB(skb)->frag_max_size;
                if (mtu < IPV6_MIN_MTU)
                        mtu = IPV6_MIN_MTU;
        }

        if (np && np->frag_size < mtu) {
                if (np->frag_size)
                        mtu = np->frag_size;
        }
        if (mtu < hlen + sizeof(struct frag_hdr) + 8)
                goto fail_toobig;
        mtu -= hlen + sizeof(struct frag_hdr);

        frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
                                    &ipv6_hdr(skb)->saddr);

        if (skb->ip_summed == CHECKSUM_PARTIAL &&
            (err = skb_checksum_help(skb)))
                goto fail;

        hroom = LL_RESERVED_SPACE(rt->dst.dev);
        if (skb_has_frag_list(skb)) {
                unsigned int first_len = skb_pagelen(skb);
                struct sk_buff *frag2;

                if (first_len - hlen > mtu ||
                    ((first_len - hlen) & 7) ||
                    skb_cloned(skb) ||
                    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
                        goto slow_path;

                skb_walk_frags(skb, frag) {
                        /* Correct geometry. */
                        if (frag->len > mtu ||
                            ((frag->len & 7) && frag->next) ||
                            skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
                                goto slow_path_clean;

                        /* Partially cloned skb? */
                        if (skb_shared(frag))
                                goto slow_path_clean;

                        BUG_ON(frag->sk);
                        if (skb->sk) {
                                frag->sk = skb->sk;
                                frag->destructor = sock_wfree;
                        }
                        skb->truesize -= frag->truesize;
                }

                err = 0;
                offset = 0;
                /* BUILD HEADER */

                *prevhdr = NEXTHDR_FRAGMENT;
                tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
                if (!tmp_hdr) {
                        err = -ENOMEM;
                        goto fail;
                }
                frag = skb_shinfo(skb)->frag_list;
                skb_frag_list_init(skb);

                __skb_pull(skb, hlen);
                fh = __skb_push(skb, sizeof(struct frag_hdr));
                __skb_push(skb, hlen);
                skb_reset_network_header(skb);
                memcpy(skb_network_header(skb), tmp_hdr, hlen);

                fh->nexthdr = nexthdr;
                fh->reserved = 0;
                fh->frag_off = htons(IP6_MF);
                fh->identification = frag_id;

                first_len = skb_pagelen(skb);
                skb->data_len = first_len - skb_headlen(skb);
                skb->len = first_len;
                ipv6_hdr(skb)->payload_len = htons(first_len -
                                                   sizeof(struct ipv6hdr));

                for (;;) {
                        /* Prepare the header of the next frame,
                         * before the previous one goes down. */
                        if (frag) {
                                frag->ip_summed = CHECKSUM_NONE;
                                skb_reset_transport_header(frag);
                                fh = __skb_push(frag, sizeof(struct frag_hdr));
                                __skb_push(frag, hlen);
                                skb_reset_network_header(frag);
                                memcpy(skb_network_header(frag), tmp_hdr,
                                       hlen);
                                offset += skb->len - hlen - sizeof(struct frag_hdr);
                                fh->nexthdr = nexthdr;
                                fh->reserved = 0;
                                fh->frag_off = htons(offset);
                                if (frag->next)
                                        fh->frag_off |= htons(IP6_MF);
                                fh->identification = frag_id;
                                ipv6_hdr(frag)->payload_len =
                                                htons(frag->len -
                                                      sizeof(struct ipv6hdr));
                                ip6_copy_metadata(frag, skb);
                        }

                        err = output(net, sk, skb);
                        if (!err)
                                IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                                              IPSTATS_MIB_FRAGCREATES);

                        if (err || !frag)
                                break;

                        skb = frag;
                        frag = skb->next;
                        skb->next = NULL;
                }

                kfree(tmp_hdr);

                if (err == 0) {
                        IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                                      IPSTATS_MIB_FRAGOKS);
                        return 0;
                }

                kfree_skb_list(frag);

                IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                              IPSTATS_MIB_FRAGFAILS);
                return err;

slow_path_clean:
                skb_walk_frags(skb, frag2) {
                        if (frag2 == frag)
                                break;
                        frag2->sk = NULL;
                        frag2->destructor = NULL;
                        skb->truesize += frag2->truesize;
                }
        }

slow_path:
        left = skb->len - hlen;         /* Space per frame */
        ptr = hlen;                     /* Where to start from */

        /*
         *      Fragment the datagram.
         */

        troom = rt->dst.dev->needed_tailroom;

        /*
         *      Keep copying data until we run out.
         */
        while (left > 0)        {
                u8 *fragnexthdr_offset;

                len = left;
                /* IF: it doesn't fit, use 'mtu' - the data space left */
                if (len > mtu)
                        len = mtu;
                /* IF: we are not sending up to and including the packet end
                   then align the next start on an eight byte boundary */
                if (len < left) {
                        len &= ~7;
                }

                /* Allocate buffer */
                frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
                                 hroom + troom, GFP_ATOMIC);
                if (!frag) {
                        err = -ENOMEM;
                        goto fail;
                }

                /*
                 *      Set up data on packet
                 */

                ip6_copy_metadata(frag, skb);
                skb_reserve(frag, hroom);
                skb_put(frag, len + hlen + sizeof(struct frag_hdr));
                skb_reset_network_header(frag);
                fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
                frag->transport_header = (frag->network_header + hlen +
                                          sizeof(struct frag_hdr));

                /*
                 *      Charge the memory for the fragment to any owner
                 *      it might possess
                 */
                if (skb->sk)
                        skb_set_owner_w(frag, skb->sk);

                /*
                 *      Copy the packet header into the new buffer.
                 */
                skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

                fragnexthdr_offset = skb_network_header(frag);
                fragnexthdr_offset += prevhdr - skb_network_header(skb);
                *fragnexthdr_offset = NEXTHDR_FRAGMENT;

                /*
                 *      Build fragment header.
                 */
                fh->nexthdr = nexthdr;
                fh->reserved = 0;
                fh->identification = frag_id;

                /*
                 *      Copy a block of the IP datagram.
                 */
                BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
                                     len));
                left -= len;

                fh->frag_off = htons(offset);
                if (left > 0)
                        fh->frag_off |= htons(IP6_MF);
                ipv6_hdr(frag)->payload_len = htons(frag->len -
                                                    sizeof(struct ipv6hdr));

                ptr += len;
                offset += len;

                /*
                 *      Put this fragment into the sending queue.
                 */
                err = output(net, sk, frag);
                if (err)
                        goto fail;

                IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                              IPSTATS_MIB_FRAGCREATES);
        }
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                      IPSTATS_MIB_FRAGOKS);
        consume_skb(skb);
        return err;

fail_toobig:
        if (skb->sk && dst_allfrag(skb_dst(skb)))
                sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
        err = -EMSGSIZE;

fail:
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                      IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return err;
}

static inline int ip6_rt_check(const struct rt6key *rt_key,
                               const struct in6_addr *fl_addr,
                               const struct in6_addr *addr_cache)
{
        return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
                (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
                                          struct dst_entry *dst,
                                          const struct flowi6 *fl6)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct rt6_info *rt;

        if (!dst)
                goto out;

        if (dst->ops->family != AF_INET6) {
                dst_release(dst);
                return NULL;
        }

        rt = (struct rt6_info *)dst;
        /* Yes, checking route validity in the unconnected
         * case is not very simple. Take into account
         * that we do not support routing by source, TOS,
         * and MSG_DONTROUTE            --ANK (980726)
         *
         * 1. ip6_rt_check(): If the route was a host route,
         *    check that the cached destination is current.
         *    If it is a network route, we still may
         *    check its validity using the saved pointer
         *    to the last used address: daddr_cache.
         *    We do not want to save the whole address now,
         *    (because the main consumer of this service
         *    is tcp, which does not have this problem),
         *    so the last trick works only on connected
         *    sockets.
         * 2. oif also should be the same.
         */
        if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
            ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
           (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
              (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
                dst_release(dst);
                dst = NULL;
        }

out:
        return dst;
}

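/* Descriptive note: ip6_dst_lookup_tail() is the core of the lookup helpers
 * below. It resolves a source address if the flow does not have one,
 * performs the route lookup, and optionally steers around nexthops that
 * are not yet reachable while our source address is still optimistic
 * (DAD not finished).
 */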
static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
                               struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        struct neighbour *n;
        struct rt6_info *rt;
#endif
        int err;
        int flags = 0;

        /* The correct way to handle this would be to do
         * ip6_route_get_saddr, and then ip6_route_output; however,
         * the route-specific preferred source forces the
         * ip6_route_output call _before_ ip6_route_get_saddr.
         *
         * In source specific routing (no src=any default route),
         * ip6_route_output will fail given src=any saddr, though, so
         * that's why we try it again later.
         */
        if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
                struct fib6_info *from;
                struct rt6_info *rt;
                bool had_dst = *dst != NULL;

                if (!had_dst)
                        *dst = ip6_route_output(net, sk, fl6);
                rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;

                rcu_read_lock();
                from = rt ? rcu_dereference(rt->from) : NULL;
                err = ip6_route_get_saddr(net, from, &fl6->daddr,
                                          sk ? inet6_sk(sk)->srcprefs : 0,
                                          &fl6->saddr);
                rcu_read_unlock();

                if (err)
                        goto out_err_release;

                /* If we had an erroneous initial result, pretend it
                 * never existed and let the SA-enabled version take
                 * over.
                 */
                if (!had_dst && (*dst)->error) {
                        dst_release(*dst);
                        *dst = NULL;
                }

                if (fl6->flowi6_oif)
                        flags |= RT6_LOOKUP_F_IFACE;
        }

        if (!*dst)
                *dst = ip6_route_output_flags(net, sk, fl6, flags);

        err = (*dst)->error;
        if (err)
                goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        /*
         * Here, if the dst entry we've looked up
         * has a neighbour entry that is in the INCOMPLETE
         * state and the src address from the flow is
         * marked as OPTIMISTIC, we release the found
         * dst entry and replace it instead with the
         * dst entry of the nexthop router.
         */
        rt = (struct rt6_info *) *dst;
        rcu_read_lock_bh();
        n = __ipv6_neigh_lookup_noref(rt->dst.dev,
                                      rt6_nexthop(rt, &fl6->daddr));
        err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
        rcu_read_unlock_bh();

        if (err) {
                struct inet6_ifaddr *ifp;
                struct flowi6 fl_gw6;
                int redirect;

                ifp = ipv6_get_ifaddr(net, &fl6->saddr,
                                      (*dst)->dev, 1);

                redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
                if (ifp)
                        in6_ifa_put(ifp);

                if (redirect) {
                        /*
                         * We need to get the dst entry for the
                         * default router instead
                         */
                        dst_release(*dst);
                        memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
                        memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
                        *dst = ip6_route_output(net, sk, &fl_gw6);
                        err = (*dst)->error;
                        if (err)
                                goto out_err_release;
                }
        }
#endif
        if (ipv6_addr_v4mapped(&fl6->saddr) &&
            !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
                err = -EAFNOSUPPORT;
                goto out_err_release;
        }

        return 0;

out_err_release:
        dst_release(*dst);
        *dst = NULL;

        if (err == -ENETUNREACH)
                IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
        return err;
}

/**
 *      ip6_dst_lookup - perform route lookup on flow
 *      @net: network namespace to do the lookup in
 *      @sk: socket which provides route info
 *      @dst: pointer to dst_entry * for result
 *      @fl6: flow to lookup
 *
 *      This function performs a route lookup on the given flow.
 *
 *      It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
                   struct flowi6 *fl6)
{
        *dst = NULL;
        return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *      @sk: socket which provides route info
 *      @fl6: flow to lookup
 *      @final_dst: final destination address for ipsec lookup
 *
 *      This function performs a route lookup on the given flow.
 *
 *      It returns a valid dst pointer on success, or a pointer encoded
 *      error code.
 */
struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
                                      const struct in6_addr *final_dst)
{
        struct dst_entry *dst = NULL;
        int err;

        err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
        if (err)
                return ERR_PTR(err);
        if (final_dst)
                fl6->daddr = *final_dst;

        return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *      @sk: socket which provides the dst cache and route info
 *      @fl6: flow to lookup
 *      @final_dst: final destination address for ipsec lookup
 *      @connected: whether @sk is connected or not
 *
 *      This function performs a route lookup on the given flow with the
 *      possibility of using the cached route in the socket if it is valid.
 *      It will take the socket dst lock when operating on the dst cache.
 *      As a result, this function can only be used in process context.
 *
 *      In addition, for a connected socket, cache the dst in the socket
 *      if the current cache is not valid.
 *
 *      It returns a valid dst pointer on success, or a pointer encoded
 *      error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
                                         const struct in6_addr *final_dst,
                                         bool connected)
{
        struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

        dst = ip6_sk_dst_check(sk, dst, fl6);
        if (dst)
                return dst;

        dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
        if (connected && !IS_ERR(dst))
                ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);

        return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
                                               gfp_t gfp)
{
        return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
                                                gfp_t gfp)
{
        return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static void ip6_append_data_mtu(unsigned int *mtu,
                                int *maxfraglen,
                                unsigned int fragheaderlen,
                                struct sk_buff *skb,
                                struct rt6_info *rt,
                                unsigned int orig_mtu)
{
        if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
                if (!skb) {
                        /* first fragment, reserve header_len */
                        *mtu = orig_mtu - rt->dst.header_len;

                } else {
                        /*
                         * this fragment is not first, the headers
                         * space is regarded as data space.
                         */
                        *mtu = orig_mtu;
                }
                *maxfraglen = ((*mtu - fragheaderlen) & ~7)
                              + fragheaderlen - sizeof(struct frag_hdr);
        }
}

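/* Descriptive note: ip6_setup_cork() prepares per-socket cork state for
 * ip6_append_data(): it duplicates the tx options so they outlive the
 * caller, pins the route and flow, and computes the fragment size from the
 * path MTU, honouring IPV6_MTU_DISCOVER and the per-socket frag_size.
 */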
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
                          struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
                          struct rt6_info *rt, struct flowi6 *fl6)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        unsigned int mtu;
        struct ipv6_txoptions *opt = ipc6->opt;

        /*
         * setup for corking
         */
        if (opt) {
                if (WARN_ON(v6_cork->opt))
                        return -EINVAL;

                v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
                if (unlikely(!v6_cork->opt))
                        return -ENOBUFS;

                v6_cork->opt->tot_len = sizeof(*opt);
                v6_cork->opt->opt_flen = opt->opt_flen;
                v6_cork->opt->opt_nflen = opt->opt_nflen;

                v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
                                                    sk->sk_allocation);
                if (opt->dst0opt && !v6_cork->opt->dst0opt)
                        return -ENOBUFS;

                v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
                                                    sk->sk_allocation);
                if (opt->dst1opt && !v6_cork->opt->dst1opt)
                        return -ENOBUFS;

                v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
                                                   sk->sk_allocation);
                if (opt->hopopt && !v6_cork->opt->hopopt)
                        return -ENOBUFS;

                v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
                                                    sk->sk_allocation);
                if (opt->srcrt && !v6_cork->opt->srcrt)
                        return -ENOBUFS;

                /* need source address above --miyazawa */
        }
        dst_hold(&rt->dst);
        cork->base.dst = &rt->dst;
        cork->fl.u.ip6 = *fl6;
        v6_cork->hop_limit = ipc6->hlimit;
        v6_cork->tclass = ipc6->tclass;
        if (rt->dst.flags & DST_XFRM_TUNNEL)
                mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
                      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
        else
                mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
                        READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
        if (np->frag_size < mtu) {
                if (np->frag_size)
                        mtu = np->frag_size;
        }
        if (mtu < IPV6_MIN_MTU)
                return -EINVAL;
        cork->base.fragsize = mtu;
        cork->base.gso_size = sk->sk_type == SOCK_DGRAM ? ipc6->gso_size : 0;

        if (dst_allfrag(xfrm_dst_path(&rt->dst)))
                cork->base.flags |= IPCORK_ALLFRAG;
        cork->base.length = 0;

        return 0;
}

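/* Descriptive note: __ip6_append_data() appends data to the queue of
 * pending packets, growing the tail skb or starting a new fragment-sized
 * skb as needed; the IPv6 headers themselves are filled in later, when the
 * pending queue is flushed (ip6_push_pending_frames() path).
 */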
1230 static int __ip6_append_data(struct sock *sk,
1231                              struct flowi6 *fl6,
1232                              struct sk_buff_head *queue,
1233                              struct inet_cork *cork,
1234                              struct inet6_cork *v6_cork,
1235                              struct page_frag *pfrag,
1236                              int getfrag(void *from, char *to, int offset,
1237                                          int len, int odd, struct sk_buff *skb),
1238                              void *from, int length, int transhdrlen,
1239                              unsigned int flags, struct ipcm6_cookie *ipc6,
1240                              const struct sockcm_cookie *sockc)
1241 {
1242         struct sk_buff *skb, *skb_prev = NULL;
1243         unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1244         int exthdrlen = 0;
1245         int dst_exthdrlen = 0;
1246         int hh_len;
1247         int copy;
1248         int err;
1249         int offset = 0;
1250         __u8 tx_flags = 0;
1251         u32 tskey = 0;
1252         struct rt6_info *rt = (struct rt6_info *)cork->dst;
1253         struct ipv6_txoptions *opt = v6_cork->opt;
1254         int csummode = CHECKSUM_NONE;
1255         unsigned int maxnonfragsize, headersize;
1256         unsigned int wmem_alloc_delta = 0;
1257         bool paged;
1258
1259         skb = skb_peek_tail(queue);
1260         if (!skb) {
1261                 exthdrlen = opt ? opt->opt_flen : 0;
1262                 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1263         }
1264
1265         paged = !!cork->gso_size;
1266         mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1267         orig_mtu = mtu;
1268
1269         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1270
1271         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1272                         (opt ? opt->opt_nflen : 0);
1273         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1274                      sizeof(struct frag_hdr);
1275
1276         headersize = sizeof(struct ipv6hdr) +
1277                      (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1278                      (dst_allfrag(&rt->dst) ?
1279                       sizeof(struct frag_hdr) : 0) +
1280                      rt->rt6i_nfheader_len;
1281
1282         /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1283          * the first fragment
1284          */
1285         if (headersize + transhdrlen > mtu)
1286                 goto emsgsize;
1287
1288         if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1289             (sk->sk_protocol == IPPROTO_UDP ||
1290              sk->sk_protocol == IPPROTO_RAW)) {
1291                 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1292                                 sizeof(struct ipv6hdr));
1293                 goto emsgsize;
1294         }
1295
1296         if (ip6_sk_ignore_df(sk))
1297                 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1298         else
1299                 maxnonfragsize = mtu;
1300
1301         if (cork->length + length > maxnonfragsize - headersize) {
1302 emsgsize:
1303                 pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1304                 ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1305                 return -EMSGSIZE;
1306         }
1307
1308         /* CHECKSUM_PARTIAL only with no extension headers and when
1309          * we are not going to fragment
1310          */
1311         if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1312             headersize == sizeof(struct ipv6hdr) &&
1313             length <= mtu - headersize &&
1314             (!(flags & MSG_MORE) || cork->gso_size) &&
1315             rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1316                 csummode = CHECKSUM_PARTIAL;
1317
1318         if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
1319                 sock_tx_timestamp(sk, sockc->tsflags, &tx_flags);
1320                 if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
1321                     sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1322                         tskey = sk->sk_tskey++;
1323         }
1324
1325         /*
1326          * Let's try using as much space as possible.
1327          * Use MTU if total length of the message fits into the MTU.
1328          * Otherwise, we need to reserve fragment header and
1329          * fragment alignment (= 8-15 octects, in total).
1330          *
1331          * Note that we may need to "move" the data from the tail of
1332          * of the buffer to the new fragment when we split
1333          * the message.
1334          *
1335          * FIXME: It may be fragmented into multiple chunks
1336          *        at once if non-fragmentable extension headers
1337          *        are too large.
1338          * --yoshfuji
1339          */
1340
1341         cork->length += length;
1342         if (!skb)
1343                 goto alloc_new_skb;
1344
1345         while (length > 0) {
1346                 /* Check if the remaining data fits into current packet. */
1347                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1348                 if (copy < length)
1349                         copy = maxfraglen - skb->len;
1350
1351                 if (copy <= 0) {
1352                         char *data;
1353                         unsigned int datalen;
1354                         unsigned int fraglen;
1355                         unsigned int fraggap;
1356                         unsigned int alloclen;
1357                         unsigned int pagedlen = 0;
1358 alloc_new_skb:
1359                         /* There's no room in the current skb */
1360                         if (skb)
1361                                 fraggap = skb->len - maxfraglen;
1362                         else
1363                                 fraggap = 0;
1364                         /* update mtu and maxfraglen if necessary */
1365                         if (!skb || !skb_prev)
1366                                 ip6_append_data_mtu(&mtu, &maxfraglen,
1367                                                     fragheaderlen, skb, rt,
1368                                                     orig_mtu);
1369
1370                         skb_prev = skb;
1371
1372                         /*
1373                          * If remaining data exceeds the mtu,
1374                          * we know we need more fragment(s).
1375                          */
1376                         datalen = length + fraggap;
1377
1378                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1379                                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1380                         fraglen = datalen + fragheaderlen;
1381
1382                         if ((flags & MSG_MORE) &&
1383                             !(rt->dst.dev->features&NETIF_F_SG))
1384                                 alloclen = mtu;
1385                         else if (!paged)
1386                                 alloclen = fraglen;
1387                         else {
1388                                 alloclen = min_t(int, fraglen, MAX_HEADER);
1389                                 pagedlen = fraglen - alloclen;
1390                         }

			alloclen += dst_exthdrlen;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			alloclen += rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			copy = datalen - transhdrlen - fraggap - pagedlen;
			if (copy < 0) {
				err = -EINVAL;
				goto error;
			}
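			/*
			 * First fragment (transhdrlen != 0): charge the
			 * socket via sock_alloc_send_skb(), which may block
			 * unless MSG_DONTWAIT is set. Later fragments use
			 * plain alloc_skb() against a soft 2 * sk_sndbuf
			 * cap; their truesize is batched in wmem_alloc_delta
			 * and charged to sk_wmem_alloc once, on exit.
			 */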
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
				    2 * sk->sk_sndbuf)
					skb = alloc_skb(alloclen + hh_len,
							sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = tx_flags;
			tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen - pagedlen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
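			/*
			 * If the previous skb was allowed to run past
			 * maxfraglen (fraggap != 0), move the overhanging
			 * tail into this new fragment and patch up both
			 * checksums accordingly.
			 */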
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			if (copy > 0 &&
			    getfrag(from, data + transhdrlen, offset,
				    copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= copy + transhdrlen;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			if ((flags & MSG_CONFIRM) && !skb_prev)
				skb_set_dst_pending_confirm(skb, 1);

			/*
			 * Put the packet on the pending queue
			 */
			if (!skb->destructor) {
				skb->destructor = sock_wfree;
				skb->sk = sk;
				wmem_alloc_delta += skb->truesize;
			}
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG) &&
		    skb_tailroom(skb) >= copy) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			wmem_alloc_delta += copy;
		}
		offset += copy;
		length -= copy;
	}

	if (wmem_alloc_delta)
		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return 0;

error_efault:
	err = -EFAULT;
error:
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return err;
}

int ip6_append_data(struct sock *sk,
		    int getfrag(void *from, char *to, int offset, int len,
				int odd, struct sk_buff *skb),
		    void *from, int length, int transhdrlen,
		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
		    struct rt6_info *rt, unsigned int flags,
		    const struct sockcm_cookie *sockc)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	int exthdrlen;
	int err;

	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
				     ipc6, rt, fl6);
		if (err)
			return err;

		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		fl6 = &inet->cork.fl.u.ip6;
		transhdrlen = 0;
	}

	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
				 &np->cork, sk_page_frag(sk), getfrag,
				 from, length, transhdrlen, flags, ipc6, sockc);
}
EXPORT_SYMBOL_GPL(ip6_append_data);
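
/*
 * Usage sketch, for illustration only (simplified from callers such as
 * rawv6_sendmsg(); locking and most error handling omitted):
 *
 *	err = ip6_append_data(sk, getfrag, msg, ulen, transhdrlen,
 *			      &ipc6, &fl6, rt, msg->msg_flags, &sockc);
 *	if (err)
 *		ip6_flush_pending_frames(sk);
 *	else if (!(msg->msg_flags & MSG_MORE))
 *		err = ip6_push_pending_frames(sk);
 *
 * Repeated calls while corked keep growing sk->sk_write_queue; the
 * final push builds and transmits the packet(s).
 */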

static void ip6_cork_release(struct inet_cork_full *cork,
			     struct inet6_cork *v6_cork)
{
	if (v6_cork->opt) {
		kfree(v6_cork->opt->dst0opt);
		kfree(v6_cork->opt->dst1opt);
		kfree(v6_cork->opt->hopopt);
		kfree(v6_cork->opt->srcrt);
		kfree(v6_cork->opt);
		v6_cork->opt = NULL;
	}

	if (cork->base.dst) {
		dst_release(cork->base.dst);
		cork->base.dst = NULL;
		cork->base.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&cork->fl, 0, sizeof(cork->fl));
}

struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);

	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, np), fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}
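
/*
 * __ip6_make_skb() above collapses the entire pending queue into one
 * skb: the first skb carries the IPv6 header, and every later skb is
 * chained onto skb_shinfo(skb)->frag_list, which the output path can
 * later emit as individual fragments without copying the data again.
 */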

int ip6_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	int err;

	err = ip6_local_out(net, skb->sk, skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP6_INC_STATS(net, rt->rt6i_idev,
				      IPSTATS_MIB_OUTDISCARDS);
	}

	return err;
}

int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	skb = ip6_finish_skb(sk);
	if (!skb)
		return 0;

	return ip6_send_skb(skb);
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
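
/*
 * ip6_push_pending_frames() pairs with ip6_append_data() on
 * sk->sk_write_queue; ip6_flush_pending_frames() below is its
 * error-path counterpart, discarding the queue instead of sending it.
 * Note that ip6_send_skb() folds positive NET_XMIT_* return codes into
 * an errno via net_xmit_errno(), so callers see 0 or a negative error.
 */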

static void __ip6_flush_pending_frames(struct sock *sk,
				       struct sk_buff_head *queue,
				       struct inet_cork_full *cork,
				       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(cork, v6_cork);
}

void ip6_flush_pending_frames(struct sock *sk)
{
	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);

struct sk_buff *ip6_make_skb(struct sock *sk,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
			     struct rt6_info *rt, unsigned int flags,
			     struct inet_cork_full *cork,
			     const struct sockcm_cookie *sockc)
{
	struct inet6_cork v6_cork;
	struct sk_buff_head queue;
	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

	cork->base.flags = 0;
	cork->base.addr = 0;
	cork->base.opt = NULL;
	cork->base.dst = NULL;
	v6_cork.opt = NULL;
	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
	if (err) {
		ip6_cork_release(cork, &v6_cork);
		return ERR_PTR(err);
	}
	if (ipc6->dontfrag < 0)
		ipc6->dontfrag = inet6_sk(sk)->dontfrag;

	err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags, ipc6, sockc);
	if (err) {
		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
}
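
/*
 * Usage sketch for this uncorked fast path, for illustration only
 * (simplified from the way udpv6_sendmsg() uses it; the
 * udp_v6_send_skb() step shown belongs to the UDP code, not this file):
 *
 *	skb = ip6_make_skb(sk, getfrag, msg, ulen, sizeof(struct udphdr),
 *			   &ipc6, &fl6, (struct rt6_info *)dst,
 *			   msg->msg_flags, &cork, &sockc);
 *	err = PTR_ERR_OR_ZERO(skb);
 *	if (!IS_ERR_OR_NULL(skb))
 *		err = udp_v6_send_skb(skb, &fl6, &cork.base);
 *
 * Unlike ip6_append_data(), this builds the packet on a private queue
 * with per-call cork state, so no socket-wide cork or lock is held
 * across calls.
 */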